From ee5a45f8e9f416e603ce1c3ba9d010151ce6663e Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Tue, 28 Mar 2023 19:28:07 -0400 Subject: [PATCH 01/28] Fano --- dynamo/preprocessing/preprocessor_utils.py | 104 +++++++++++++++++++-- 1 file changed, 97 insertions(+), 7 deletions(-) diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index 5a15bc9a4..52317eb8b 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -1,5 +1,5 @@ import warnings -from typing import Callable, List, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union try: from typing import Literal @@ -495,6 +495,88 @@ def select_genes_by_dispersion_svr( return mean.flatten(), variance, highly_variable_mask, highly_variable_scores +# Highly variable gene selection function: +def get_highvar_genes_sparse( + expression: Union[ + np.ndarray, + scipy.sparse.csr_matrix, + scipy.sparse.csc_matrix, + scipy.sparse.coo_matrix, + ], + expected_fano_threshold: Optional[float] = None, + numgenes: Optional[int] = None, + minimal_mean: float = 0.5, +) -> Tuple[pd.DataFrame, Dict]: + """Find highly-variable genes in sparse single-cell data matrices. + + Args: + expression: Gene expression matrix + expected_fano_threshold: Optionally can be used to set a manual dispersion threshold (for definition of + "highly-variable") + numgenes: Optionally can be used to find the n most variable genes + minimal_mean: Sets a threshold on the minimum mean expression to consider + + Returns: + gene_counts_stats: Results dataframe containing pertinent information for each gene + gene_fano_parameters: Additional informative dictionary (w/ records of dispersion for each gene, threshold, + etc.) + """ + gene_mean = np.array(expression.mean(axis=0)).astype(float).reshape(-1) + E2 = expression.copy() + E2.data **= 2 + gene2_mean = np.array(E2.mean(axis=0)).reshape(-1) + gene_var = pd.Series(gene2_mean - (gene_mean**2)) + del E2 + gene_mean = pd.Series(gene_mean) + gene_fano = gene_var / gene_mean + + # Find parameters for expected fano line -- this line can be non-linear... + top_genes = gene_mean.sort_values(ascending=False)[:20].index + A = (np.sqrt(gene_var) / gene_mean)[top_genes].min() + + w_mean_low, w_mean_high = gene_mean.quantile([0.10, 0.90]) + w_fano_low, w_fano_high = gene_fano.quantile([0.10, 0.90]) + winsor_box = ( + (gene_fano > w_fano_low) & (gene_fano < w_fano_high) & (gene_mean > w_mean_low) & (gene_mean < w_mean_high) + ) + fano_median = gene_fano[winsor_box].median() + B = np.sqrt(fano_median) + + gene_expected_fano = (A**2) * gene_mean + (B**2) + fano_ratio = gene_fano / gene_expected_fano + + # Identify high var genes + if numgenes is not None: + highvargenes = fano_ratio.sort_values(ascending=False).index[:numgenes] + high_var_genes_ind = fano_ratio.index.isin(highvargenes) + T = None + else: + if not expected_fano_threshold: + T = 1.0 + gene_fano[winsor_box].std() + else: + T = expected_fano_threshold + + high_var_genes_ind = (fano_ratio > T) & (gene_mean > minimal_mean) + + gene_counts_stats = pd.DataFrame( + { + "mean": gene_mean, + "var": gene_var, + "fano": gene_fano, + "expected_fano": gene_expected_fano, + "high_var": high_var_genes_ind, + "fano_ratio": fano_ratio, + } + ) + gene_fano_parameters = { + "A": A, + "B": B, + "T": T, + "minimal_mean": minimal_mean, + } + return (gene_counts_stats, gene_fano_parameters) + + def SVRs( adata_ori: anndata.AnnData, filter_bool: Union[np.ndarray, None] = None, @@ -641,11 +723,19 @@ def SVRs( if svr_gamma is None: svr_gamma = 150.0 / len(mu) # Fit the Support Vector Regression - clf = SVR(gamma=svr_gamma) - clf.fit(log_m[:, None], log_cv) + clf = SVR(kernel="rbf", gamma=svr_gamma) + # clf.fit(log_m[:, None], log_cv) + + (gene_counts_stats, gene_fano_parameters) = get_highvar_genes_sparse(valid_CM) + target = np.array(gene_counts_stats["fano"]).flatten() + ground = np.array(gene_counts_stats["mean"]).flatten()[:, None] + clf.fit(ground, target) + fitted_fun = clf.predict - ff = fitted_fun(log_m[:, None]) - score = log_cv - ff + # ff = fitted_fun(log_m[:, None]) + # score = log_cv - ff + ff = fitted_fun(ground) + score = target - ff if sort_inverse: score = -score @@ -660,8 +750,8 @@ def SVRs( adata.var.loc[detected_bool, prefix + "log_cv"], adata.var.loc[detected_bool, prefix + "score"], ) = ( - np.array(log_m).flatten(), - np.array(log_cv).flatten(), + np.array(ground).flatten(), + np.array(target).flatten(), np.array(score).flatten(), ) From 36de28983b76f8ca3b489d4e1294750ea02faefe Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Wed, 29 Mar 2023 12:50:54 -0400 Subject: [PATCH 02/28] use defautl kernel --- dynamo/preprocessing/preprocess.py | 3 +++ dynamo/preprocessing/preprocessor_utils.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index a6299d7a6..55b73ecff 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -1650,6 +1650,9 @@ def select_genes_monocle_legacy( ) filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) + print(filter_bool) + condition = filter_bool == True + print(np.where(condition)[0]) # filter genes by gene expression fraction as well adata.var["frac"], invalid_ids = compute_gene_exp_fraction(X=adata.X, threshold=exprs_frac_for_gene_exclusion) diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index 52317eb8b..9f10107c2 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -723,7 +723,7 @@ def SVRs( if svr_gamma is None: svr_gamma = 150.0 / len(mu) # Fit the Support Vector Regression - clf = SVR(kernel="rbf", gamma=svr_gamma) + clf = SVR(gamma=svr_gamma) # clf.fit(log_m[:, None], log_cv) (gene_counts_stats, gene_fano_parameters) = get_highvar_genes_sparse(valid_CM) From a55534b324da810015c77092ea8778a31d43dacd Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Fri, 31 Mar 2023 20:49:41 -0400 Subject: [PATCH 03/28] call direct select_genes_monocle --- dynamo/preprocessing/Preprocessor.py | 32 ++++++++++------------ dynamo/preprocessing/preprocessor_utils.py | 2 +- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 1cf017e70..ad64140cb 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -35,6 +35,7 @@ log1p_adata, normalize_cell_expr_by_size_factors, select_genes_by_dispersion_general, + select_genes_monocle, ) from .utils import ( collapse_species_adata, @@ -380,7 +381,7 @@ def _pca(self, adata: AnnData) -> None: self.pca(adata, **self.pca_kwargs) def config_monocle_recipe( - self, adata: AnnData, n_top_genes: int = 2000, gene_selection_method: str = "SVR" + self, adata: AnnData, n_top_genes: int = 2000, gene_selection_method: str = "gini" ) -> None: """Automatically configure the preprocessor for monocle recipe. @@ -421,24 +422,21 @@ def config_monocle_recipe( "min_count_p": 0, "shared_count": 30, } - self.select_genes = select_genes_by_dispersion_general + self.select_genes = select_genes_monocle self.select_genes_kwargs = { - "recipe": "monocle", - "monocle_kwargs": { - "sort_by": gene_selection_method, - "n_top_genes": n_top_genes, - "keep_filtered": True, - "SVRs_kwargs": { - "min_expr_cells": 0, - "min_expr_avg": 0, - "max_expr_avg": np.inf, - "svr_gamma": None, - "winsorize": False, - "winsor_perc": (1, 99.5), - "sort_inverse": False, - }, - "only_bools": True, + "sort_by": gene_selection_method, + "n_top_genes": n_top_genes, + "keep_filtered": True, + "SVRs_kwargs": { + "min_expr_cells": 0, + "min_expr_avg": 0, + "max_expr_avg": np.inf, + "svr_gamma": None, + "winsorize": False, + "winsor_perc": (1, 99.5), + "sort_inverse": False, }, + "only_bools": True, } self.normalize_selected_genes = None self.normalize_by_cells = normalize_cell_expr_by_size_factors diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index 9f10107c2..62ce83b31 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -854,8 +854,8 @@ def select_genes_monocle( filter_bool=filter_bool, **SVRs_args, ) - filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) + # elif sort_by == "fano": # filter genes by gene expression fraction as well adata.var["frac"], invalid_ids = compute_gene_exp_fraction(X=adata.X, threshold=exprs_frac_for_gene_exclusion) From dc00cf8b5006c94fbc977c4f5093e980f96ef6f0 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Tue, 4 Apr 2023 18:00:12 -0400 Subject: [PATCH 04/28] gene selection refactoring --- dynamo/plot/preprocess.py | 2 +- dynamo/preprocessing/Preprocessor.py | 29 +- dynamo/preprocessing/__init__.py | 11 +- dynamo/preprocessing/gene_selection.py | 986 ++++++++++++++++++ dynamo/preprocessing/preprocess.py | 43 +- .../preprocessing/preprocess_monocle_utils.py | 302 ------ dynamo/preprocessing/preprocessor_utils.py | 684 ------------ 7 files changed, 1040 insertions(+), 1017 deletions(-) create mode 100644 dynamo/preprocessing/gene_selection.py diff --git a/dynamo/plot/preprocess.py b/dynamo/plot/preprocess.py index 6f86d5e55..ca29fa358 100755 --- a/dynamo/plot/preprocess.py +++ b/dynamo/plot/preprocess.py @@ -9,7 +9,7 @@ from ..configuration import DynamoAdataKeyManager from ..dynamo_logger import main_warning from ..preprocessing import preprocess as pp -from ..preprocessing.preprocess_monocle_utils import top_table +from ..preprocessing.gene_selection import top_table from ..preprocessing.utils import detect_experiment_datatype from ..tools.utils import get_mapper, update_dict from .utils import save_fig diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index ad64140cb..6ae1d1aa5 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -21,7 +21,7 @@ sctransform, select_genes_by_pearson_residuals, ) -from ..tools.connectivity import neighbors as default_neighbors +from .gene_selection import select_genes_by_seurat_recipe, select_genes_monocle from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca_monocle from .preprocessor_utils import _infer_labeling_experiment_type from .preprocessor_utils import ( @@ -34,8 +34,6 @@ is_log1p_transformed_adata, log1p_adata, normalize_cell_expr_by_size_factors, - select_genes_by_dispersion_general, - select_genes_monocle, ) from .utils import ( collapse_species_adata, @@ -57,7 +55,7 @@ def __init__( filter_genes_by_outliers_kwargs: dict = {}, normalize_by_cells_function: Callable = normalize_cell_expr_by_size_factors, normalize_by_cells_function_kwargs: dict = {}, - select_genes_function: Callable = select_genes_by_dispersion_general, + select_genes_function: Callable = select_genes_monocle, select_genes_kwargs: dict = {}, normalize_selected_genes_function: Callable = None, normalize_selected_genes_kwargs: dict = {}, @@ -381,7 +379,7 @@ def _pca(self, adata: AnnData) -> None: self.pca(adata, **self.pca_kwargs) def config_monocle_recipe( - self, adata: AnnData, n_top_genes: int = 2000, gene_selection_method: str = "gini" + self, adata: AnnData, n_top_genes: int = 2000, gene_selection_method: str = "cv_dispersion" ) -> None: """Automatically configure the preprocessor for monocle recipe. @@ -424,19 +422,20 @@ def config_monocle_recipe( } self.select_genes = select_genes_monocle self.select_genes_kwargs = { - "sort_by": gene_selection_method, "n_top_genes": n_top_genes, + "sort_by": "cv_dispersion" if gene_selection_method is None else gene_selection_method, "keep_filtered": True, "SVRs_kwargs": { + "relative_expr": True, + "total_szfactor": "total_Size_Factor", "min_expr_cells": 0, "min_expr_avg": 0, "max_expr_avg": np.inf, - "svr_gamma": None, "winsorize": False, "winsor_perc": (1, 99.5), "sort_inverse": False, + "svr_gamma": None, }, - "only_bools": True, } self.normalize_selected_genes = None self.normalize_by_cells = normalize_cell_expr_by_size_factors @@ -482,7 +481,7 @@ def preprocess_adata_monocle( temp_logger.finish_progress(progress_name="preprocess") - def config_seurat_recipe(self, adata: AnnData) -> None: + def config_seurat_recipe(self, adata: AnnData, gene_selection_method: str = "seurat_dispersion") -> None: """Automatically configure the preprocessor for using the seurat style recipe. Args: @@ -490,8 +489,11 @@ def config_seurat_recipe(self, adata: AnnData) -> None: """ self.config_monocle_recipe(adata) - self.select_genes = select_genes_by_dispersion_general - self.select_genes_kwargs = {"recipe": "seurat", "n_top_genes": 2000} + self.select_genes = select_genes_by_seurat_recipe + self.select_genes_kwargs = { + "algorithm": "seurat_dispersion" if gene_selection_method is None else gene_selection_method, + "n_top_genes": 2000, + } self.normalize_by_cells_function_kwargs = {"skip_log": True} self.pca_kwargs = {"pca_key": "X_pca"} self.filter_genes_by_outliers_kwargs = {"shared_count": 20} @@ -684,6 +686,7 @@ def preprocess_adata( recipe: Literal[ "monocle", "seurat", "sctransform", "pearson_residuals", "monocle_pearson_residuals" ] = "monocle", + gene_selection_method: Optional[str] = None, tkey: Optional[str] = None, ) -> None: """Preprocess the AnnData object with the recipe specified. @@ -698,10 +701,10 @@ def preprocess_adata( """ if recipe == "monocle": - self.config_monocle_recipe(adata) + self.config_monocle_recipe(adata, gene_selection_method=gene_selection_method) self.preprocess_adata_monocle(adata, tkey=tkey) elif recipe == "seurat": - self.config_seurat_recipe(adata) + self.config_seurat_recipe(adata, gene_selection_method=gene_selection_method) self.preprocess_adata_seurat(adata, tkey=tkey) elif recipe == "sctransform": self.config_sctransform_recipe(adata) diff --git a/dynamo/preprocessing/__init__.py b/dynamo/preprocessing/__init__.py index 4ce7baefc..90dfa63a7 100755 --- a/dynamo/preprocessing/__init__.py +++ b/dynamo/preprocessing/__init__.py @@ -5,7 +5,6 @@ from .dynast import lambda_correction from .preprocess import ( Gini, - SVRs, calc_sz_factor_legacy, filter_cells_by_outliers, filter_cells_legacy, @@ -16,7 +15,6 @@ normalize_cell_expr_by_size_factors_legacy, recipe_monocle, recipe_velocyto, - select_genes_monocle, ) from .preprocessor_utils import * from .utils import ( @@ -38,7 +36,12 @@ normalize_cells = normalize_cell_expr_by_size_factors from .CnmfPreprocessor import CnmfPreprocessor -from .preprocess_monocle_utils import estimate_dispersion, top_table +from .gene_selection import ( + estimate_dispersion, + select_genes_by_svr, + select_genes_monocle, + top_table, +) from .Preprocessor import Preprocessor __all__ = [ @@ -59,7 +62,7 @@ "filter_genes", "filter_genes_by_outliers", "filter_genes_by_clusters_", - "SVRs", + "select_genes_by_svr", "get_svr_filter", "highest_frac_genes", "cell_cycle_scores", diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py new file mode 100644 index 000000000..2af9f81bc --- /dev/null +++ b/dynamo/preprocessing/gene_selection.py @@ -0,0 +1,986 @@ +import re +import warnings +from typing import Dict, List, Optional, Tuple, Union + +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal + +import numpy as np +import pandas as pd +import scipy.sparse +import statsmodels.api as sm +from anndata import AnnData +from scipy.sparse import csr_matrix, issparse + +from ..configuration import DKM, DynamoAdataConfig, DynamoAdataKeyManager +from ..dynamo_logger import ( + LoggerManager, + main_debug, + main_info, + main_info_insert_adata_var, + main_warning, +) +from .preprocessor_utils import ( + calc_mean_var_dispersion_sparse, + calc_sz_factor, + get_nan_or_inf_data_bool_mask, + get_svr_filter, + seurat_get_mean_var, +) +from .utils import compute_gene_exp_fraction, cook_dist, merge_adata_attrs + + +def parametric_dispersion_fit( + disp_table: pd.DataFrame, initial_coefs: np.ndarray = np.array([1e-6, 1]) +) -> Tuple[sm.formula.glm, np.ndarray, pd.DataFrame]: + """Perform the dispersion parameter fitting with initial guesses of coefficients. + + This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). + + Args: + disp_table: A pandas dataframe with mu, dispersion for each gene that passes filters. + initial_coefs: Initial parameters for the gamma fit of the dispersion parameters. Defaults to + np.array([1e-6, 1]). + + Returns: + A tuple (fit, coefs, good), where fit is a statsmodels fitting object, coefs contains the two resulting gamma + fitting coefficient, and good is the the subsetted dispersion table that is subjected to Gamma fitting. + """ + + coefs = initial_coefs + iter = 0 + while True: + residuals = disp_table["disp"] / (coefs[0] + coefs[1] / disp_table["mu"]) + good = disp_table.loc[(residuals > initial_coefs[0]) & (residuals < 10000), :] + # https://stats.stackexchange.com/questions/356053/the-identity-link-function-does-not-respect-the-domain-of-the + # -gamma-family + fit = sm.formula.glm( + "disp ~ I(1 / mu)", + data=good, + family=sm.families.Gamma(link=sm.genmod.families.links.identity), + ).train(start_params=coefs) + + oldcoefs = coefs + coefs = fit.params + + if coefs[0] < initial_coefs[0]: + coefs[0] = initial_coefs[0] + if coefs[1] < 0: + main_warning("Parametric dispersion fit may be failed.") + + if np.sum(np.log(coefs / oldcoefs) ** 2 < coefs[0]): + break + iter += 1 + + if iter > 10: + main_warning("Dispersion fit didn't converge") + break + if not np.all(coefs > 0): + main_warning("Parametric dispersion fit may be failed.") + + return fit, coefs, good + + +def disp_calc_helper_NB( + adata: AnnData, layers: str = "X", min_cells_detected: int = 1 +) -> Tuple[List[str], List[pd.DataFrame]]: + """Helper function to calculate the dispersion parameter. + + This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). + + Args: + adata: an Anndata object. + layers: the layer of data used for dispersion fitting. Defaults to "X". + min_cells_detected: the minimal required number of cells with expression for selecting gene for dispersion + fitting. Defaults to 1. + + Returns: + layers: a list of layers available. + res_list: a list of pd.DataFrames with mu, dispersion for each gene that passes filters. + """ + layers = DynamoAdataKeyManager.get_available_layer_keys(adata, layers=layers, include_protein=False) + + res_list = [] + for layer in layers: + if layer == "raw": + CM = adata.raw.X + szfactors = adata.obs[layer + "Size_Factor"][:, None] + elif layer == "X": + CM = adata.X + szfactors = adata.obs["Size_Factor"][:, None] + else: + CM = adata.layers[layer] + szfactors = adata.obs[layer + "Size_Factor"][:, None] + + if issparse(CM): + CM.data = np.round(CM.data, 0) + rounded = CM + else: + rounded = CM.round().astype("int") + + lowerDetectedLimit = adata.uns["lowerDetectedLimit"] if "lowerDetectedLimit" in adata.uns.keys() else 1 + nzGenes = (rounded > lowerDetectedLimit).sum(axis=0) + nzGenes = nzGenes > min_cells_detected + + nzGenes = nzGenes.A1 if issparse(rounded) else nzGenes + if layer.startswith("X_"): + x = rounded[:, nzGenes] + else: + x = ( + rounded[:, nzGenes].multiply(csr_matrix(1 / szfactors)) + if issparse(rounded) + else rounded[:, nzGenes] / szfactors + ) + + xim = np.mean(1 / szfactors) if szfactors is not None else 1 + + f_expression_mean = x.mean(axis=0) + + # For NB: Var(Y) = mu * (1 + mu / k) + # x.A.var(axis=0, ddof=1) + f_expression_var = ( + (x.multiply(x).mean(0).A1 - f_expression_mean.A1**2) * x.shape[0] / (x.shape[0] - 1) + if issparse(x) + else x.var(axis=0, ddof=0) ** 2 + ) # np.mean(np.power(x - f_expression_mean, 2), axis=0) # variance with n - 1 + # https://scialert.net/fulltext/?doi=ajms.2010.1.15 method of moments + disp_guess_meth_moments = f_expression_var - xim * f_expression_mean # variance - mu + + disp_guess_meth_moments = disp_guess_meth_moments / np.power( + f_expression_mean, 2 + ) # this is dispersion parameter (1/k) + + res = pd.DataFrame( + { + "mu": np.array(f_expression_mean).flatten(), + "disp": np.array(disp_guess_meth_moments).flatten(), + } + ) + res.loc[res["mu"] == 0, "mu"] = None + res.loc[res["mu"] == 0, "disp"] = None + res.loc[res["disp"] < 0, "disp"] = 0 + + res["gene_id"] = adata.var_names[nzGenes] + + res_list.append(res) + + return layers, res_list + + +def estimate_dispersion( + adata: AnnData, + layers: str = "X", + modelFormulaStr: str = "~ 1", + min_cells_detected: int = 1, + removeOutliers: bool = False, +) -> AnnData: + """This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). + + Args: + adata: an AnnData object. + layers: the layer(s) to be used for calculating dispersion. Default is "X" if there is no spliced layers. + modelFormulaStr: the model formula used to calculate dispersion parameters. Not used. Defaults to "~ 1". + min_cells_detected: the minimum number of cells detected for calculating the dispersion. Defaults to 1. + removeOutliers: whether to remove outliers when performing dispersion fitting. Defaults to False. + + Raises: + Exception: there is no valid DataFrames with mu for genes. + + Returns: + An updated annData object with dispFitInfo added to uns attribute as a new key. + """ + + logger = LoggerManager.gen_logger("dynamo-preprocessing") + # mu = None + model_terms = [x.strip() for x in re.compile("~|\\*|\\+").split(modelFormulaStr)] + model_terms = list(set(model_terms) - set([""])) + + cds_pdata = adata.obs # .loc[:, model_terms] + cds_pdata["rowname"] = cds_pdata.index.values + layers, disp_tables = disp_calc_helper_NB(adata[:, :], layers, min_cells_detected) + # disp_table['disp'] = np.random.uniform(0, 10, 11) + # disp_table = cds_pdata.apply(disp_calc_helper_NB(adata[:, :], min_cells_detected)) + + # cds_pdata <- dplyr::group_by_(dplyr::select_(rownames_to_column(pData(cds)), "rowname", .dots=model_terms), .dots + # =model_terms) + # disp_table <- as.data.frame(cds_pdata %>% do(disp_calc_helper_NB(cds[,.$rowname], cds@expressionFamily, min_cells_ + # detected))) + for ind in range(len(layers)): + layer, disp_table = layers[ind], disp_tables[ind] + + if disp_table is None: + raise Exception("Parametric dispersion fitting failed, please set a different lowerDetectionLimit") + + disp_table = disp_table.loc[np.where(disp_table["mu"] != np.nan)[0], :] + + res = parametric_dispersion_fit(disp_table) + fit, coefs, good = res[0], res[1], res[2] + + if removeOutliers: + # influence = fit.get_influence().cooks_distance() + # #CD is the distance and p is p-value + # (CD, p) = influence.cooks_distance + + CD = cook_dist(fit, 1 / good["mu"][:, None], good) + cooksCutoff = 4 / good.shape[0] + main_info("Removing " + str(len(CD[CD > cooksCutoff])) + " outliers") + outliers = CD > cooksCutoff + # use CD.index.values? remove genes that lost when doing parameter fitting + lost_gene = set(good.index.values).difference(set(range(len(CD)))) + outliers[lost_gene] = True + res = parametric_dispersion_fit(good.loc[~outliers, :]) + + fit, coefs = res[0], res[1] + + def ans(q): + return coefs[0] + coefs[1] / q + + if layer == "X": + logger.info_insert_adata("dispFitInfo", "uns") + adata.uns["dispFitInfo"] = { + "disp_table": good, + "disp_func": ans, + "coefs": coefs, + } + else: + logger.info_insert_adata(layer + "_dispFitInfo", "uns") + adata.uns[layer + "_dispFitInfo"] = { + "disp_table": good, + "disp_func": ans, + "coefs": coefs, + } + + return adata + + +def top_table(adata: AnnData, layer: str = "X", mode: Literal["dispersion", "gini"] = "dispersion") -> pd.DataFrame: + """Retrieve a table that contains gene names and other info whose dispersions/gini index are highest. + + This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). + + Get information of the top layer. + + Args: + adata: an AnnData object. + layer: the layer(s) that would be searched for. Defaults to "X". + mode: either "dispersion" or "gini", deciding whether dispersion data or gini data would be acquired. Defaults + to "dispersion". + + Raises: + KeyError: if mode is set to dispersion but there is no available dispersion model. + + Returns: + The data frame of the top layer with the gene_id, mean_expression, dispersion_fit and dispersion_empirical as + the columns. + """ + + layer = DynamoAdataKeyManager.get_available_layer_keys(adata, layers=layer, include_protein=False)[0] + + if layer in ["X"]: + key = "dispFitInfo" + else: + key = layer + "_dispFitInfo" + + if mode == "dispersion": + if adata.uns[key] is None: + estimate_dispersion(adata, layers=[layer]) + + if adata.uns[key] is None: + raise KeyError( + "Error: for adata.uns.key=%s, no dispersion model found. Please call estimate_dispersion() before calling this function" + % key + ) + + top_df = pd.DataFrame( + { + "gene_id": adata.uns[key]["disp_table"]["gene_id"], + "mean_expression": adata.uns[key]["disp_table"]["mu"], + "dispersion_fit": adata.uns[key]["disp_func"](adata.uns[key]["disp_table"]["mu"]), + "dispersion_empirical": adata.uns[key]["disp_table"]["disp"], + } + ) + top_df = top_df.set_index("gene_id") + + elif mode == "gini": + top_df = adata.var[layer + "_gini"] + + return top_df + + +def select_genes_monocle( + adata: AnnData, + layer: str = "X", + keep_filtered: bool = True, + n_top_genes: int = 2000, + sort_by: Literal["gini", "cv_dispersion", "fano_dispersion"] = "cv_dispersion", + exprs_frac_for_gene_exclusion: float = 1, + genes_to_exclude: Union[List[str], None] = None, + SVRs_kwargs: dict = {}, +) -> Union[AnnData, np.ndarray]: + """Select genes based on monocle recipe. + + This version is here for modularization of preprocessing, so that users may try combinations of different + preprocessing procedures in Preprocessor. + + Args: + adata: an AnnData object. + layer: The data from a particular layer (include X) used for feature selection. Defaults to "X". + total_szfactor: The column name in the .obs attribute that corresponds to the size factor for the total mRNA. + Defaults to "total_Size_Factor". + keep_filtered: Whether to keep genes that don't pass the filtering in the adata object. Defaults to True. + sort_by: the sorting methods, either SVR, dispersion or Gini index, to be used to select genes. Defaults to + "SVR". + n_top_genes: the number of top genes based on scoring method (specified by sort_by) will be selected as feature + genes. Defaults to 2000. + SVRs_kwargs: kwargs for `SVRs`. Defaults to {}. + only_bools: Only return a vector of bool values. Defaults to False. + exprs_frac_for_gene_exclusion: threshold of fractions for high fraction genes. Defaults to 1. + genes_to_exclude: genes that are excluded from evaluation. Defaults to None. + + Returns: + The adata object with genes updated if `only_bools` is false. Otherwise, the bool array representing selected + genes. + """ + + # The following size factor calculation is a prerequisite for monocle recipe preprocess in preprocessor. + adata = calc_sz_factor( + adata, + total_layers=adata.uns["pp"]["experiment_total_layers"], + scale_to=None, + splicing_total_layers=False, + X_total_layers=False, + layers=adata.uns["pp"]["experiment_layers"], + genes_use_for_norm=None, + ) + + filter_bool = ( + adata.var["pass_basic_filter"] + if "pass_basic_filter" in adata.var.columns + else np.ones(adata.shape[1], dtype=bool) + ) + + if adata.shape[1] <= n_top_genes: + filter_bool = np.ones(adata.shape[1], dtype=bool) + else: + # table = top_table(adata, layer, mode="dispersion") + # valid_table = table.query("dispersion_empirical > dispersion_fit") + # valid_table = valid_table.loc[ + # set(adata.var.index[filter_bool]).intersection(valid_table.index), + # :, + # ] + # gene_id = np.argsort(-valid_table.loc[:, "dispersion_empirical"])[:n_top_genes] + # gene_id = valid_table.iloc[gene_id, :].index + # filter_bool = adata.var.index.isin(gene_id) + if sort_by == "gini": + # table = top_table(adata, layer, mode="gini") + valid_table = adata.var[layer + "_gini"].loc[filter_bool, :] + gene_id = np.argsort(-valid_table.loc[:, "gini"])[:n_top_genes] + gene_id = valid_table.index[gene_id] + filter_bool = gene_id.isin(adata.var.index) + # elif : + # SVRs_args = { + # "min_expr_cells": 0, + # "min_expr_avg": 0, + # "max_expr_avg": np.inf, + # "svr_gamma": None, + # "winsorize": False, + # "winsor_perc": (1, 99.5), + # "sort_inverse": False, + # } + # SVRs_args = update_dict(SVRs_args, SVRs_kwargs) + # adata = SVRs( + # adata, + # layers=[layer], + # total_szfactor=total_szfactor, + # filter_bool=filter_bool, + # **SVRs_args, + # ) + # filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) + elif sort_by == "cv_dispersion" or sort_by == "fano_dispersion": + # These parameters are already defined as default values in SVRs function. Do we still need this? + # SVRs_args = { + # "min_expr_cells": 0, + # "min_expr_avg": 0, + # "max_expr_avg": np.inf, + # "svr_gamma": None, + # "winsorize": False, + # "winsor_perc": (1, 99.5), + # "sort_inverse": False, + # } + # SVRs_args = update_dict(SVRs_args, SVRs_kwargs) + adata = select_genes_by_svr( + adata, + layers=[layer], + filter_bool=filter_bool, + algorithm=sort_by, + **SVRs_kwargs, + ) + filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) + else: + raise ValueError(f"The algorithm {sort_by} is not existed") + + # filter genes by gene expression fraction as well + adata.var["frac"], invalid_ids = compute_gene_exp_fraction(X=adata.X, threshold=exprs_frac_for_gene_exclusion) + genes_to_exclude = ( + list(adata.var_names[invalid_ids]) + if genes_to_exclude is None + else genes_to_exclude + list(adata.var_names[invalid_ids]) + ) + if genes_to_exclude is not None and len(genes_to_exclude) > 0: + adata_exclude_genes = adata.var.index.intersection(genes_to_exclude) + adata.var.loc[adata_exclude_genes, "use_for_pca"] = False + + if keep_filtered: + adata.var["use_for_pca"] = filter_bool + else: + adata._inplace_subset_var(filter_bool) + adata.var["use_for_pca"] = True + + # return filter_bool if only_bools else adata + + +def select_genes_by_svr( + adata_ori: AnnData, + filter_bool: Union[np.ndarray, None] = None, + layers: str = "X", + algorithm: Literal["cv_dispersion", "fano_dispersion"] = "cv_dispersion", + use_all_genes_cells: bool = False, + **SVRs_kwargs, +) -> AnnData: + """Support Vector Regression to identify highly variable genes. + + This function is modified from https://github.com/velocyto-team/velocyto.py/blob/master/velocyto/analysis.py + + Args: + adata_ori: an AnnData object + filter_bool: A boolean array from the user to select genes for downstream analysis. Defaults to None. + layers: The layer(s) to be used for calculating dispersion score via support vector regression (SVR). Defaults + to "X". + relative_expr: A logic flag to determine whether we need to divide gene expression values first by size factor + before run SVR. Defaults to True. + total_szfactor: The column name in the .obs attribute that corresponds to the size factor for the total mRNA. + Defaults to "total_Size_Factor". + min_expr_cells: minimum number of cells that express the gene for it to be considered in the fit. Defaults to 0. + min_expr_avg: The minimum average of genes across cells required for gene to be selected for SVR analyses. + Defaults to 0. + max_expr_avg: The maximum average of genes across cells required for gene to be selected for SVR analyses. Genes + with average gene expression larger than this value will be treated as house-keeping/outlier genes. Defaults + to np.inf. + svr_gamma: the gamma hyper-parameter of the SVR. Defaults to None. + winsorize: Weather to winsorize the data for the cv vs mean model. Defaults to False. + winsor_perc: the up and lower bound of the winsorization. Defaults to (1, 99.5). + sort_inverse: whether to sort genes from less noisy to more noisy (to use for size estimation not for feature + selection). Defaults to False. + use_all_genes_cells: A logic flag to determine whether all cells and genes should be used for the size factor + calculation. Defaults to False. + + Returns: + An updated annData object with `log_m`, `log_cv`, `score` added to .obs columns and `SVR` added to uns attribute + as a new key. + """ + + layers = DKM.get_available_layer_keys(adata_ori, layers) + winsorize = SVRs_kwargs.get("winsorize", False) + winsor_perc = SVRs_kwargs.get("winsor_perc", (1, 99.5)) + svr_gamma = SVRs_kwargs.pop("svr_gamma", None) + sort_inverse = SVRs_kwargs.pop("sort_inverse", False) + + if use_all_genes_cells: + # let us ignore the `inplace` parameter in pandas.Categorical.remove_unused_categories warning. + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + adata = adata_ori[:, filter_bool].copy() if filter_bool is not None else adata_ori + else: + cell_inds = adata_ori.obs.use_for_pca if "use_for_pca" in adata_ori.obs.columns else adata_ori.obs.index + filter_list = ["use_for_pca", "pass_basic_filter"] + filter_checker = [i in adata_ori.var.columns for i in filter_list] + which_filter = np.where(filter_checker)[0] + + gene_inds = adata_ori.var[filter_list[which_filter[0]]] if len(which_filter) > 0 else adata_ori.var.index + + # let us ignore the `inplace` parameter in pandas.Categorical.remove_unused_categories warning. + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + adata = adata_ori[cell_inds, gene_inds].copy() + + for layer in layers: + valid_CM, detected_bool = get_vaild_CM(adata, layer, **SVRs_kwargs) + if valid_CM is None: + continue + + ground, target, mean = get_ground_target(algorithm, valid_CM, winsorize, winsor_perc) + fitted_fun = get_prediction_by_svr(ground, target, mean, svr_gamma) + score = target - fitted_fun(ground) + if sort_inverse: + score = -score + + # Now we can get "SVR" from get_prediction_by_svr + # key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR" + # adata_ori.uns[key] = {"SVR": fitted_fun} + + prefix = "" if layer == "X" else layer + "_" + (adata.var[prefix + "log_m"], adata.var[prefix + "log_cv"], adata.var[prefix + "score"],) = ( + np.nan, + np.nan, + -np.inf, + ) + ( + adata.var.loc[detected_bool, prefix + "log_m"], + adata.var.loc[detected_bool, prefix + "log_cv"], + adata.var.loc[detected_bool, prefix + "score"], + ) = ( + np.array(ground).flatten(), + np.array(target).flatten(), + np.array(score).flatten(), + ) + + adata_ori = merge_adata_attrs(adata_ori, adata, attr="var") + + return adata_ori + + +def get_vaild_CM( + adata: AnnData, + layer: str = "X", + relative_expr: bool = True, + total_szfactor: str = "total_Size_Factor", + min_expr_cells: int = 0, + min_expr_avg: int = 0, + max_expr_avg: int = np.inf, + winsorize: bool = False, + winsor_perc: Tuple[float, float] = (1, 99.5), +): + CM = None + if layer == "raw": + CM = adata.X.copy() if adata.raw is None else adata.raw + szfactors = ( + adata.obs[layer + "_Size_Factor"].values[:, None] + if adata.raw.X is not None + else adata.obs["Size_Factor"].values[:, None] + ) + elif layer == "X": + CM = adata.X.copy() + szfactors = adata.obs["Size_Factor"].values[:, None] + elif layer == "protein": + if "protein" in adata.obsm_keys(): + CM = adata.obsm["protein"].copy() + szfactors = adata.obs[layer + "_Size_Factor"].values[:, None] + else: + CM = adata.layers[layer].copy() + szfactors = ( + adata.obs[layer + "_Size_Factor"].values[:, None] if layer + "_Size_Factor" in adata.obs.columns else None + ) + + if total_szfactor is not None and total_szfactor in adata.obs.keys(): + szfactors = adata.obs[total_szfactor].values[:, None] if total_szfactor in adata.obs.columns else None + + if szfactors is not None and relative_expr: + if issparse(CM): + from sklearn.utils import sparsefuncs + + sparsefuncs.inplace_row_scale(CM, 1 / szfactors) + else: + CM /= szfactors + + if winsorize: + if min_expr_cells <= ((100 - winsor_perc[1]) * CM.shape[0] * 0.01): + min_expr_cells = int(np.ceil((100 - winsor_perc[1]) * CM.shape[1] * 0.01)) + 2 + + detected_bool = np.array( + ((CM > 0).sum(0) >= min_expr_cells) & (CM.mean(0) <= max_expr_avg) & (CM.mean(0) >= min_expr_avg) + ).flatten() + + return CM[:, detected_bool], detected_bool + + +def get_ground_target(algorithm, valid_CM, winsorize, winsor_perc) -> AnnData: + if algorithm == "fano_dispersion": + (gene_counts_stats, gene_fano_parameters) = get_highvar_genes_sparse(valid_CM) + ground = np.array(gene_counts_stats["mean"]).flatten()[:, None] + target = np.array(gene_counts_stats["fano"]).flatten() + mu = gene_counts_stats["mean"] + elif algorithm == "cv_dispersion": + if winsorize: + down, up = ( + np.percentile(valid_CM.A, winsor_perc, 0) + if issparse(valid_CM) + else np.percentile(valid_CM, winsor_perc, 0) + ) + Sfw = ( + np.clip(valid_CM.A, down[None, :], up[None, :]) + if issparse(valid_CM) + else np.percentile(valid_CM, winsor_perc, 0) + ) + mu = Sfw.mean(0) + sigma = Sfw.std(0, ddof=1) + else: + mu = np.array(valid_CM.mean(0)).flatten() + sigma = ( + np.array( + np.sqrt( + (valid_CM.multiply(valid_CM).mean(0).A1 - (mu) ** 2) + # * (adata.n_obs) + # / (adata.n_obs - 1) + ) + ) + if issparse(valid_CM) + else valid_CM.std(0, ddof=1) + ) + + cv = sigma / mu + log_m = np.array(np.log2(mu)).flatten() + log_cv = np.array(np.log2(cv)).flatten() + log_m[mu == 0], log_cv[mu == 0] = 0, 0 + ground = log_m[:, None] + target = log_cv + else: + raise ValueError(f"The algorithm {algorithm} is not existed") + + return ground, target, mu + + +def get_prediction_by_svr(ground, target, mean, svr_gamma): + from sklearn.svm import SVR + + if svr_gamma is None: + svr_gamma = 150.0 / len(mean) + + # Fit the Support Vector Regression + clf = SVR(gamma=svr_gamma) + clf.fit(ground, target) + return clf.predict + + +# Highly variable gene selection function: +def get_highvar_genes_sparse( + expression: Union[ + np.ndarray, + scipy.sparse.csr_matrix, + scipy.sparse.csc_matrix, + scipy.sparse.coo_matrix, + ], + expected_fano_threshold: Optional[float] = None, + numgenes: Optional[int] = None, + minimal_mean: float = 0.5, +) -> Tuple[pd.DataFrame, Dict]: + """Find highly-variable genes in sparse single-cell data matrices. + + Args: + expression: Gene expression matrix + expected_fano_threshold: Optionally can be used to set a manual dispersion threshold (for definition of + "highly-variable") + numgenes: Optionally can be used to find the n most variable genes + minimal_mean: Sets a threshold on the minimum mean expression to consider + + Returns: + gene_counts_stats: Results dataframe containing pertinent information for each gene + gene_fano_parameters: Additional informative dictionary (w/ records of dispersion for each gene, threshold, + etc.) + """ + gene_mean = np.array(expression.mean(axis=0)).astype(float).reshape(-1) + E2 = expression.copy() + E2.data **= 2 + gene2_mean = np.array(E2.mean(axis=0)).reshape(-1) + gene_var = pd.Series(gene2_mean - (gene_mean**2)) + del E2 + gene_mean = pd.Series(gene_mean) + gene_fano = gene_var / gene_mean + + # Find parameters for expected fano line -- this line can be non-linear... + top_genes = gene_mean.sort_values(ascending=False)[:20].index + A = (np.sqrt(gene_var) / gene_mean)[top_genes].min() + + w_mean_low, w_mean_high = gene_mean.quantile([0.10, 0.90]) + w_fano_low, w_fano_high = gene_fano.quantile([0.10, 0.90]) + winsor_box = ( + (gene_fano > w_fano_low) & (gene_fano < w_fano_high) & (gene_mean > w_mean_low) & (gene_mean < w_mean_high) + ) + fano_median = gene_fano[winsor_box].median() + B = np.sqrt(fano_median) + + gene_expected_fano = (A**2) * gene_mean + (B**2) + fano_ratio = gene_fano / gene_expected_fano + + # Identify high var genes + if numgenes is not None: + highvargenes = fano_ratio.sort_values(ascending=False).index[:numgenes] + high_var_genes_ind = fano_ratio.index.isin(highvargenes) + T = None + else: + if not expected_fano_threshold: + T = 1.0 + gene_fano[winsor_box].std() + else: + T = expected_fano_threshold + + high_var_genes_ind = (fano_ratio > T) & (gene_mean > minimal_mean) + + gene_counts_stats = pd.DataFrame( + { + "mean": gene_mean, + "var": gene_var, + "fano": gene_fano, + "expected_fano": gene_expected_fano, + "high_var": high_var_genes_ind, + "fano_ratio": fano_ratio, + } + ) + gene_fano_parameters = { + "A": A, + "B": B, + "T": T, + "minimal_mean": minimal_mean, + } + return (gene_counts_stats, gene_fano_parameters) + + +def select_genes_by_seurat_recipe( + adata: AnnData, + layer: str = DKM.X_LAYER, + nan_replace_val: Union[float, None] = None, + n_top_genes: int = 2000, + algorithm: Literal["seurat_dispersion", "fano_dispersion"] = "seurat_dispersion", + seurat_min_disp: Union[float, None] = None, + seurat_max_disp: Union[float, None] = None, + seurat_min_mean: Union[float, None] = None, + seurat_max_mean: Union[float, None] = None, + gene_names: Union[List[str], None] = None, + var_filter_key: str = "pass_basic_filter", + inplace: bool = False, +) -> None: + """A general function for feature genes selection. + + Preprocess adata and dispatch to different filtering methods, and eventually set keys in anndata to denote which + genes are wanted in downstream analysis. + + Args: + adata: an AnnData object. + layer: the key of a sparse matrix in adata. Defaults to DKM.X_LAYER. + nan_replace_val: your choice of value to replace values in layer. Defaults to None. + n_top_genes: number of genes to select as highly variable genes. Defaults to 2000. + algorithm: a method for selecting genes; must be one of "seurat_dispersion" or "fano". + seurat_min_disp: seurat dispersion min cutoff. Defaults to None. + seurat_max_disp: seurat dispersion max cutoff. Defaults to None. + seurat_min_mean: seurat mean min cutoff. Defaults to None. + seurat_max_mean: seurat mean max cutoff. Defaults to None. + gene_names: name of genes to be selected. Defaults to None. + var_filter_key: filter gene names based on the key defined in adata.var before gene selection. Defaults to + "pass_basic_filter". + inplace: when inplace is True, subset adata according to selected genes. Defaults to False. + + Raises: + NotImplementedError: the recipe is invalid/unsupported. + """ + + pass_filter_genes = adata.var_names + if gene_names: + main_info("select genes on gene names from arguments ") + pass_filter_genes = gene_names + elif var_filter_key: + main_info("select genes on var key: %s" % (var_filter_key)) + pass_filter_genes = adata.var_names[adata.var[var_filter_key]] + + if len(pass_filter_genes) != len(set(pass_filter_genes)): + main_warning("gene names are not unique, please check your preprocessing procedure.") + subset_adata = adata[:, pass_filter_genes] + if n_top_genes is None: + main_info("n_top_genes is None, reserve all genes and add filter gene information") + n_top_genes = adata.n_vars + layer_mat = DKM.select_layer_data(subset_adata, layer) + if nan_replace_val: + main_info("replacing nan values with: %s" % (nan_replace_val)) + _mask = get_nan_or_inf_data_bool_mask(layer_mat) + layer_mat[_mask] = nan_replace_val + + if algorithm == "seurat_dispersion": + mean, variance, highly_variable_mask = select_genes_by_seurat_dispersion( + subset_adata, + layer_mat, + min_disp=seurat_min_disp, + max_disp=seurat_max_disp, + min_mean=seurat_min_mean, + max_mean=seurat_max_mean, + n_top_genes=n_top_genes, + ) + main_info_insert_adata_var(DKM.VAR_GENE_MEAN_KEY) + main_info_insert_adata_var(DKM.VAR_GENE_VAR_KEY) + main_info_insert_adata_var(DKM.VAR_GENE_HIGHLY_VARIABLE_KEY) + main_debug("type of variance:" + str(type(variance))) + main_debug("shape of variance:" + str(variance.shape)) + adata.var[DKM.VAR_GENE_MEAN_KEY] = np.nan + adata.var[DKM.VAR_GENE_VAR_KEY] = np.nan + adata.var[DKM.VAR_GENE_HIGHLY_VARIABLE_KEY] = False + adata.var[DKM.VAR_USE_FOR_PCA] = False + + adata.var[DKM.VAR_GENE_MEAN_KEY][pass_filter_genes] = mean.flatten() + adata.var[DKM.VAR_GENE_VAR_KEY][pass_filter_genes] = variance + adata.var[DKM.VAR_GENE_HIGHLY_VARIABLE_KEY][pass_filter_genes] = highly_variable_mask + adata.var[DKM.VAR_USE_FOR_PCA][pass_filter_genes] = highly_variable_mask + + elif algorithm == "fano_dispersion": + select_genes_monocle(adata, layer=layer, sort_by=algorithm) + # adata = select_genes_by_svr( + # adata, + # layers=layer, + # algorithm=algorithm, + # ) + # filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) + else: + raise ValueError(f"The algorithm {algorithm} is not existed") + + main_info("number of selected highly variable genes: " + str(adata.var[DKM.VAR_USE_FOR_PCA].sum())) + if inplace: + main_info("inplace is True, subset adata according to selected genes.") + adata = adata[:, adata.var[DKM.VAR_USE_FOR_PCA]] + + +def select_genes_by_seurat_dispersion( + adata: AnnData, + sparse_layer_mat: csr_matrix, + n_bins: int = 20, + log_mean_and_dispersion: bool = True, + min_disp: float = None, + max_disp: float = None, + min_mean: float = None, + max_mean: float = None, + n_top_genes: Union[int, None] = None, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, None]: + """Apply seurat's gene selection recipe by cutoffs. + + Args: + adata: an AnnData object + sparse_layer_mat: the sparse matrix used for gene selection. + n_bins: the number of bins for normalization. Defaults to 20. + log_mean_and_dispersion: whether log the gene expression values before calculating the dispersion values. + Defaults to True. + min_disp: seurat dispersion min cutoff. Defaults to None. + max_disp: seurat dispersion max cutoff. Defaults to None. + min_mean: seurat mean min cutoff. Defaults to None. + max_mean: seurat mean max cutoff. Defaults to None. + n_top_genes: number of top genes to be evaluated. If set to be None, genes are filtered by mean and dispersion + norm threshold. Defaults to None. + + Returns: + A tuple (mean, variance, highly_variable_mask, highly_variable_scores), where mean is the mean of the provided + sparse matrix, variance is the variance of the provided sparse matrix, highly_variable_mask is a bool array + indicating whether an element (a gene) is highly variable in the matrix. highly_variable_scores is always none + since the scores are not applicable to Seurat recipe. + """ + + # default values from Seurat + if min_disp is None: + min_disp = 0.5 + if max_disp is None: + max_disp = np.inf + if min_mean is None: + min_mean = 0.0125 + if max_mean is None: + max_mean = 3 + + # mean, variance, dispersion = calc_mean_var_dispersion_sparse(sparse_layer_mat) # Dead + sc_mean, sc_var = seurat_get_mean_var(sparse_layer_mat) + mean, variance = sc_mean, sc_var + dispersion = variance / mean + + if log_mean_and_dispersion: + mean = np.log1p(mean) + dispersion[np.equal(dispersion, 0)] = np.nan + dispersion = np.log(dispersion) + + temp_df = pd.DataFrame() + temp_df["mean"], temp_df["dispersion"] = mean, dispersion + + temp_df["mean_bin"] = pd.cut(temp_df["mean"], bins=n_bins) + disp_grouped = temp_df.groupby("mean_bin")["dispersion"] + disp_mean_bin = disp_grouped.mean() + disp_std_bin = disp_grouped.std(ddof=1) + + # handle nan std + one_gene_per_bin = disp_std_bin.isnull() + + disp_std_bin[one_gene_per_bin] = disp_mean_bin[one_gene_per_bin].values + disp_mean_bin[one_gene_per_bin] = 0 + + # normalized dispersion + mean = disp_mean_bin[temp_df["mean_bin"].values].values + std = disp_std_bin[temp_df["mean_bin"].values].values + variance = std**2 + temp_df["dispersion_norm"] = ((temp_df["dispersion"] - mean) / std).fillna(0) + dispersion_norm = temp_df["dispersion_norm"].values + + highly_variable_mask = None + if n_top_genes is not None: + main_info("choose %d top genes" % (n_top_genes), indent_level=2) + threshold = temp_df["dispersion_norm"].nlargest(n_top_genes).values[-1] + highly_variable_mask = temp_df["dispersion_norm"].values >= threshold + else: + main_info("choose genes by mean and dispersion norm threshold", indent_level=2) + highly_variable_mask = np.logical_and.reduce( + ( + mean > min_mean, + mean < max_mean, + dispersion_norm > min_disp, + dispersion_norm < max_disp, + ) + ) + + return mean, variance, highly_variable_mask + + +def get_highly_variable_mask_by_dispersion_svr( + mean: np.ndarray, + var: np.ndarray, + n_top_genes: int, + svr_gamma: Union[float, None] = None, + return_scores: bool = True, +) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]: + """Returns the mask with shape same as mean and var. + + The mask indicates whether each index is highly variable or not. Each index should represent a gene. + + Args: + mean: mean of the genes. + var: variance of the genes. + n_top_genes: the number of top genes to be inspected. + svr_gamma: coefficient for support vector regression. Defaults to None. + return_scores: whether returen the dispersion scores. Defaults to True. + + Returns: + A tuple (highly_variable_mask, scores) where highly_variable_mask is a bool array indicating whether an element + (a gene) is highly variable in the matrix and scores is an array recording variable score for each gene. scores + would only be returned when `return_scores` is True. + """ + + # normally, select svr_gamma based on #features + if svr_gamma is None: + svr_gamma = 150.0 / len(mean) + from sklearn.svm import SVR + + mean_log = np.log2(mean) + cv_log = np.log2(np.sqrt(var) / mean) + classifier = SVR(gamma=svr_gamma) + # fit & prediction will complain about nan values if not take cared here + is_nan_indices = np.logical_or(np.isnan(mean_log), np.isnan(cv_log)) + if np.sum(is_nan_indices) > 0: + main_warning( + ( + "mean and cv_log contain NAN values. We exclude them in SVR training. Please use related gene filtering " + "methods to filter genes with zero means." + ) + ) + + classifier.fit(mean_log[~is_nan_indices, np.newaxis], cv_log.reshape([-1, 1])[~is_nan_indices]) + scores = np.repeat(np.nan, len(mean_log)) + # TODO handle nan values during prediction here + scores[~is_nan_indices] = cv_log[~is_nan_indices] - classifier.predict(mean_log[~is_nan_indices, np.newaxis]) + scores = scores.reshape([-1, 1]) # shape should be #genes x 1 + + # score threshold based on n top genes + n_top_genes = min(n_top_genes, len(mean)) # maybe not enough genes there + score_threshold = np.sort(-scores)[n_top_genes - 1] + highly_variable_mask = scores >= score_threshold + highly_variable_mask = np.array(highly_variable_mask).flatten() + if return_scores: + return highly_variable_mask, scores + return highly_variable_mask diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index 55b73ecff..9c6cad63e 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -13,7 +13,6 @@ from anndata import AnnData from scipy.sparse import csr_matrix, issparse from sklearn.decomposition import FastICA -from sklearn.utils import sparsefuncs from ..configuration import DKM, DynamoAdataConfig, DynamoAdataKeyManager from ..dynamo_logger import ( @@ -27,14 +26,12 @@ from ..tools.utils import update_dict from ..utils import copy_adata from .cell_cycle import cell_cycle_scores -from .preprocess_monocle_utils import top_table +from .gene_selection import select_genes_by_svr, top_table from .preprocessor_utils import ( - SVRs, _infer_labeling_experiment_type, filter_cells_by_outliers, filter_genes_by_outliers, normalize_cell_expr_by_size_factors, - select_genes_monocle, ) from .utils import ( Freeman_Tukey, @@ -348,17 +345,36 @@ def Gini(adata: anndata.AnnData, layers: Union[Literal["all"], List[str]] = "all cur_cm = CM[:, i].A if issparse(CM) else CM[:, i] if np.amin(CM) < 0: cur_cm -= np.amin(cur_cm) # values cannot be negative - cur_cm += 0.0000001 # np.min(array[array!=0]) #values cannot be 0 + cur_cm = cur_cm.astype(float) + 0.0000001 # np.min(array[array!=0]) #values cannot be 0 cur_cm = np.sort(cur_cm, axis=0) # values must be sorted # index per array element index = np.arange(1, cur_cm.shape[0] + 1) n = cur_cm.shape[0] # number of array elements - gini[i] = (np.sum((2 * index - n - 1) * cur_cm)) / (n * np.sum(cur_cm)) # Gini coefficient - - if layer in ["raw", "X"]: - adata.var["gini"] = gini - else: - adata.var[layer + "_gini"] = gini + aa = np.sum((2 * index - n - 1) * cur_cm) + bb = n * np.sum(cur_cm) + # gini[i] = (np.sum((2 * index - n - 1) * cur_cm)) / (n * np.sum(cur_cm)) # Gini coefficient + gini[i] = aa / bb + print(i, aa, bb, gini[i]) + + # # all values are treated equally, arrays must be 1d + # cur_cm = CM.toarray() if issparse(CM) else CM.copy() + # cur_cm[cur_cm < 0] = 0 # values cannot be negative + # cur_cm = cur_cm.astype(float) + 0.0000001 # values cannot be 0 + # cur_cm = np.sort(cur_cm, axis=0) # values must be sorted + # # index per array element + # index = np.arange(1, cur_cm.shape[0] + 1) + # n = cur_cm.shape[0] # number of array elements + # ccc = (2 * index - n - 1)[:, np.newaxis] * cur_cm + # aa = np.sum((2 * index - n - 1)[:, np.newaxis] * cur_cm, axis=0) + # bb = n * np.sum(cur_cm, axis=0) + # gini = aa / bb + # print(i, aa, bb, ccc, gini[i]) + # #gini = (np.sum((2 * index - n - 1) * cur_cm, axis=0)) / (n * np.sum(cur_cm, axis=0)) # Gini coefficient + # + # if layer in ["raw", "X"]: + # adata.var["gini"] = gini + # else: + # adata.var[layer + "_gini"] = gini return adata @@ -868,6 +884,7 @@ def recipe_monocle( X and reduced dimensions, etc., are updated. Otherwise, return None. """ + main_warning(__name__ + " is deprecated.") logger = LoggerManager.gen_logger("dynamo-preprocessing") logger.log_time() keep_filtered_cells = DynamoAdataConfig.use_default_var_if_none( @@ -1398,7 +1415,7 @@ def recipe_velocyto( adata = adata[:, filter_bool] - adata = SVRs( + adata = get_highly_variable_genes_by_svr( adata, layers=["spliced"], min_expr_cells=2, @@ -1641,7 +1658,7 @@ def select_genes_monocle_legacy( "sort_inverse": False, } SVRs_args = update_dict(SVRs_args, SVRs_kwargs) - adata = SVRs( + adata = get_highly_variable_genes_by_svr( adata, layers=[layer], total_szfactor=total_szfactor, diff --git a/dynamo/preprocessing/preprocess_monocle_utils.py b/dynamo/preprocessing/preprocess_monocle_utils.py index b4c0fe981..e69de29bb 100644 --- a/dynamo/preprocessing/preprocess_monocle_utils.py +++ b/dynamo/preprocessing/preprocess_monocle_utils.py @@ -1,302 +0,0 @@ -import re -from typing import List, Tuple - -try: - from typing import Literal -except ImportError: - from typing_extensions import Literal - -import anndata -import numpy as np -import pandas as pd -import statsmodels.api as sm -from scipy.sparse import csr_matrix, issparse - -from ..configuration import DKM, DynamoAdataConfig, DynamoAdataKeyManager -from ..dynamo_logger import ( - LoggerManager, - main_critical, - main_info, - main_info_insert_adata_obsm, - main_warning, -) -from .utils import cook_dist - - -def parametric_dispersion_fit( - disp_table: pd.DataFrame, initial_coefs: np.ndarray = np.array([1e-6, 1]) -) -> Tuple[sm.formula.glm, np.ndarray, pd.DataFrame]: - """Perform the dispersion parameter fitting with initial guesses of coefficients. - - This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). - - Args: - disp_table: A pandas dataframe with mu, dispersion for each gene that passes filters. - initial_coefs: Initial parameters for the gamma fit of the dispersion parameters. Defaults to - np.array([1e-6, 1]). - - Returns: - A tuple (fit, coefs, good), where fit is a statsmodels fitting object, coefs contains the two resulting gamma - fitting coefficient, and good is the the subsetted dispersion table that is subjected to Gamma fitting. - """ - - coefs = initial_coefs - iter = 0 - while True: - residuals = disp_table["disp"] / (coefs[0] + coefs[1] / disp_table["mu"]) - good = disp_table.loc[(residuals > initial_coefs[0]) & (residuals < 10000), :] - # https://stats.stackexchange.com/questions/356053/the-identity-link-function-does-not-respect-the-domain-of-the - # -gamma-family - fit = sm.formula.glm( - "disp ~ I(1 / mu)", - data=good, - family=sm.families.Gamma(link=sm.genmod.families.links.identity), - ).train(start_params=coefs) - - oldcoefs = coefs - coefs = fit.params - - if coefs[0] < initial_coefs[0]: - coefs[0] = initial_coefs[0] - if coefs[1] < 0: - main_warning("Parametric dispersion fit may be failed.") - - if np.sum(np.log(coefs / oldcoefs) ** 2 < coefs[0]): - break - iter += 1 - - if iter > 10: - main_warning("Dispersion fit didn't converge") - break - if not np.all(coefs > 0): - main_warning("Parametric dispersion fit may be failed.") - - return fit, coefs, good - - -def disp_calc_helper_NB( - adata: anndata.AnnData, layers: str = "X", min_cells_detected: int = 1 -) -> Tuple[List[str], List[pd.DataFrame]]: - """Helper function to calculate the dispersion parameter. - - This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). - - Args: - adata: an Anndata object. - layers: the layer of data used for dispersion fitting. Defaults to "X". - min_cells_detected: the minimal required number of cells with expression for selecting gene for dispersion - fitting. Defaults to 1. - - Returns: - layers: a list of layers available. - res_list: a list of pd.DataFrames with mu, dispersion for each gene that passes filters. - """ - layers = DynamoAdataKeyManager.get_available_layer_keys(adata, layers=layers, include_protein=False) - - res_list = [] - for layer in layers: - if layer == "raw": - CM = adata.raw.X - szfactors = adata.obs[layer + "Size_Factor"][:, None] - elif layer == "X": - CM = adata.X - szfactors = adata.obs["Size_Factor"][:, None] - else: - CM = adata.layers[layer] - szfactors = adata.obs[layer + "Size_Factor"][:, None] - - if issparse(CM): - CM.data = np.round(CM.data, 0) - rounded = CM - else: - rounded = CM.round().astype("int") - - lowerDetectedLimit = adata.uns["lowerDetectedLimit"] if "lowerDetectedLimit" in adata.uns.keys() else 1 - nzGenes = (rounded > lowerDetectedLimit).sum(axis=0) - nzGenes = nzGenes > min_cells_detected - - nzGenes = nzGenes.A1 if issparse(rounded) else nzGenes - if layer.startswith("X_"): - x = rounded[:, nzGenes] - else: - x = ( - rounded[:, nzGenes].multiply(csr_matrix(1 / szfactors)) - if issparse(rounded) - else rounded[:, nzGenes] / szfactors - ) - - xim = np.mean(1 / szfactors) if szfactors is not None else 1 - - f_expression_mean = x.mean(axis=0) - - # For NB: Var(Y) = mu * (1 + mu / k) - # x.A.var(axis=0, ddof=1) - f_expression_var = ( - (x.multiply(x).mean(0).A1 - f_expression_mean.A1**2) * x.shape[0] / (x.shape[0] - 1) - if issparse(x) - else x.var(axis=0, ddof=0) ** 2 - ) # np.mean(np.power(x - f_expression_mean, 2), axis=0) # variance with n - 1 - # https://scialert.net/fulltext/?doi=ajms.2010.1.15 method of moments - disp_guess_meth_moments = f_expression_var - xim * f_expression_mean # variance - mu - - disp_guess_meth_moments = disp_guess_meth_moments / np.power( - f_expression_mean, 2 - ) # this is dispersion parameter (1/k) - - res = pd.DataFrame( - { - "mu": np.array(f_expression_mean).flatten(), - "disp": np.array(disp_guess_meth_moments).flatten(), - } - ) - res.loc[res["mu"] == 0, "mu"] = None - res.loc[res["mu"] == 0, "disp"] = None - res.loc[res["disp"] < 0, "disp"] = 0 - - res["gene_id"] = adata.var_names[nzGenes] - - res_list.append(res) - - return layers, res_list - - -def estimate_dispersion( - adata: anndata.AnnData, - layers: str = "X", - modelFormulaStr: str = "~ 1", - min_cells_detected: int = 1, - removeOutliers: bool = False, -) -> anndata.AnnData: - """This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). - - Args: - adata: an AnnData object. - layers: the layer(s) to be used for calculating dispersion. Default is "X" if there is no spliced layers. - modelFormulaStr: the model formula used to calculate dispersion parameters. Not used. Defaults to "~ 1". - min_cells_detected: the minimum number of cells detected for calculating the dispersion. Defaults to 1. - removeOutliers: whether to remove outliers when performing dispersion fitting. Defaults to False. - - Raises: - Exception: there is no valid DataFrames with mu for genes. - - Returns: - An updated annData object with dispFitInfo added to uns attribute as a new key. - """ - - logger = LoggerManager.gen_logger("dynamo-preprocessing") - # mu = None - model_terms = [x.strip() for x in re.compile("~|\\*|\\+").split(modelFormulaStr)] - model_terms = list(set(model_terms) - set([""])) - - cds_pdata = adata.obs # .loc[:, model_terms] - cds_pdata["rowname"] = cds_pdata.index.values - layers, disp_tables = disp_calc_helper_NB(adata[:, :], layers, min_cells_detected) - # disp_table['disp'] = np.random.uniform(0, 10, 11) - # disp_table = cds_pdata.apply(disp_calc_helper_NB(adata[:, :], min_cells_detected)) - - # cds_pdata <- dplyr::group_by_(dplyr::select_(rownames_to_column(pData(cds)), "rowname", .dots=model_terms), .dots - # =model_terms) - # disp_table <- as.data.frame(cds_pdata %>% do(disp_calc_helper_NB(cds[,.$rowname], cds@expressionFamily, min_cells_ - # detected))) - for ind in range(len(layers)): - layer, disp_table = layers[ind], disp_tables[ind] - - if disp_table is None: - raise Exception("Parametric dispersion fitting failed, please set a different lowerDetectionLimit") - - disp_table = disp_table.loc[np.where(disp_table["mu"] != np.nan)[0], :] - - res = parametric_dispersion_fit(disp_table) - fit, coefs, good = res[0], res[1], res[2] - - if removeOutliers: - # influence = fit.get_influence().cooks_distance() - # #CD is the distance and p is p-value - # (CD, p) = influence.cooks_distance - - CD = cook_dist(fit, 1 / good["mu"][:, None], good) - cooksCutoff = 4 / good.shape[0] - main_info("Removing " + str(len(CD[CD > cooksCutoff])) + " outliers") - outliers = CD > cooksCutoff - # use CD.index.values? remove genes that lost when doing parameter fitting - lost_gene = set(good.index.values).difference(set(range(len(CD)))) - outliers[lost_gene] = True - res = parametric_dispersion_fit(good.loc[~outliers, :]) - - fit, coefs = res[0], res[1] - - def ans(q): - return coefs[0] + coefs[1] / q - - if layer == "X": - logger.info_insert_adata("dispFitInfo", "uns") - adata.uns["dispFitInfo"] = { - "disp_table": good, - "disp_func": ans, - "coefs": coefs, - } - else: - logger.info_insert_adata(layer + "_dispFitInfo", "uns") - adata.uns[layer + "_dispFitInfo"] = { - "disp_table": good, - "disp_func": ans, - "coefs": coefs, - } - - return adata - - -def top_table( - adata: anndata.AnnData, layer: str = "X", mode: Literal["dispersion", "gini"] = "dispersion" -) -> pd.DataFrame: - """Retrieve a table that contains gene names and other info whose dispersions/gini index are highest. - - This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). - - Get information of the top layer. - - Args: - adata: an AnnData object. - layer: the layer(s) that would be searched for. Defaults to "X". - mode: either "dispersion" or "gini", deciding whether dispersion data or gini data would be acquired. Defaults - to "dispersion". - - Raises: - KeyError: if mode is set to dispersion but there is no available dispersion model. - - Returns: - The data frame of the top layer with the gene_id, mean_expression, dispersion_fit and dispersion_empirical as - the columns. - """ - - layer = DynamoAdataKeyManager.get_available_layer_keys(adata, layers=layer, include_protein=False)[0] - - if layer in ["X"]: - key = "dispFitInfo" - else: - key = layer + "_dispFitInfo" - - if mode == "dispersion": - if adata.uns[key] is None: - estimate_dispersion(adata, layers=[layer]) - - if adata.uns[key] is None: - raise KeyError( - "Error: for adata.uns.key=%s, no dispersion model found. Please call estimate_dispersion() before calling this function" - % key - ) - - top_df = pd.DataFrame( - { - "gene_id": adata.uns[key]["disp_table"]["gene_id"], - "mean_expression": adata.uns[key]["disp_table"]["mu"], - "dispersion_fit": adata.uns[key]["disp_func"](adata.uns[key]["disp_table"]["mu"]), - "dispersion_empirical": adata.uns[key]["disp_table"]["disp"], - } - ) - top_df = top_df.set_index("gene_id") - - elif mode == "gini": - top_df = adata.var[layer + "_gini"] - - return top_df diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index 62ce83b31..9398c8787 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -13,7 +13,6 @@ from anndata import AnnData from scipy.sparse import csr_matrix, spmatrix from scipy.sparse.base import issparse -from sklearn.svm import SVR from sklearn.utils import sparsefuncs from ..configuration import DKM, DynamoAdataKeyManager @@ -31,7 +30,6 @@ ) from ..tools.utils import update_dict from ..utils import copy_adata -from .preprocess_monocle_utils import top_table from .utils import ( Freeman_Tukey, add_noise_to_duplicates, @@ -253,688 +251,6 @@ def seurat_get_mean_var( return mean, var -def select_genes_by_dispersion_general( - adata: AnnData, - layer: str = DKM.X_LAYER, - nan_replace_val: Union[float, None] = None, - n_top_genes: int = 2000, - recipe: Literal["monocle", "svr", "seurat"] = "monocle", - seurat_min_disp: Union[float, None] = None, - seurat_max_disp: Union[float, None] = None, - seurat_min_mean: Union[float, None] = None, - seurat_max_mean: Union[float, None] = None, - monocle_kwargs: dict = {}, - gene_names: Union[List[str], None] = None, - var_filter_key: str = "pass_basic_filter", - inplace: bool = False, -) -> None: - """A general function for feature genes selection. - - Preprocess adata and dispatch to different filtering methods, and eventually set keys in anndata to denote which - genes are wanted in downstream analysis. - - Args: - adata: an AnnData object. - layer: the key of a sparse matrix in adata. Defaults to DKM.X_LAYER. - nan_replace_val: your choice of value to replace values in layer. Defaults to None. - n_top_genes: number of genes to select as highly variable genes. Defaults to 2000. - recipe: a recipe for selecting genes; must be one of "monocle", "svr", or "seurat". Defaults to "monocle". - seurat_min_disp: seurat dispersion min cutoff. Defaults to None. - seurat_max_disp: seurat dispersion max cutoff. Defaults to None. - seurat_min_mean: seurat mean min cutoff. Defaults to None. - seurat_max_mean: seurat mean max cutoff. Defaults to None. - monocle_kwargs: kwargs for `select_genes_monocle`. Defaults to {}. - gene_names: name of genes to be selected. Defaults to None. - var_filter_key: filter gene names based on the key defined in adata.var before gene selection. Defaults to - "pass_basic_filter". - inplace: when inplace is True, subset adata according to selected genes. Defaults to False. - - Raises: - NotImplementedError: the recipe is invalid/unsupported. - """ - - main_info("filtering genes by dispersion...") - main_log_time() - - pass_filter_genes = adata.var_names - if gene_names: - main_info("select genes on gene names from arguments ") - pass_filter_genes = gene_names - elif var_filter_key: - main_info("select genes on var key: %s" % (var_filter_key)) - pass_filter_genes = adata.var_names[adata.var[var_filter_key]] - - if len(pass_filter_genes) != len(set(pass_filter_genes)): - main_warning("gene names are not unique, please check your preprocessing procedure.") - subset_adata = adata[:, pass_filter_genes] - if n_top_genes is None: - main_info("n_top_genes is None, reserve all genes and add filter gene information") - n_top_genes = adata.n_vars - layer_mat = DKM.select_layer_data(subset_adata, layer) - if nan_replace_val: - main_info("replacing nan values with: %s" % (nan_replace_val)) - _mask = get_nan_or_inf_data_bool_mask(layer_mat) - layer_mat[_mask] = nan_replace_val - - main_info("select genes by recipe: " + recipe) - if recipe == "svr": - mean, variance, highly_variable_mask, highly_variable_scores = select_genes_by_dispersion_svr( - subset_adata, layer_mat, n_top_genes - ) - elif recipe == "seurat": - mean, variance, highly_variable_mask, highly_variable_scores = select_genes_by_seurat_recipe( - subset_adata, - layer_mat, - min_disp=seurat_min_disp, - max_disp=seurat_max_disp, - min_mean=seurat_min_mean, - max_mean=seurat_max_mean, - n_top_genes=n_top_genes, - ) - elif recipe == "monocle": - # TODO refactor dynamo monocle selection genes part code and make it modular (same as the two functions above) - # the logics here for dynamo recipe is different from the above recipes - # Note we do not need to pass subset_adata here because monocle takes care of everything regarding dynamo - # convention - select_genes_monocle(adata, **monocle_kwargs) - adata.var[DKM.VAR_GENE_HIGHLY_VARIABLE_KEY] = adata.var[DKM.VAR_USE_FOR_PCA] - return - else: - raise NotImplementedError("Selected gene seletion recipe not supported.") - - main_info_insert_adata_var(DKM.VAR_GENE_MEAN_KEY) - main_info_insert_adata_var(DKM.VAR_GENE_VAR_KEY) - main_info_insert_adata_var(DKM.VAR_GENE_HIGHLY_VARIABLE_KEY) - main_debug("type of variance:" + str(type(variance))) - main_debug("shape of variance:" + str(variance.shape)) - adata.var[DKM.VAR_GENE_MEAN_KEY] = np.nan - adata.var[DKM.VAR_GENE_VAR_KEY] = np.nan - adata.var[DKM.VAR_GENE_HIGHLY_VARIABLE_KEY] = False - adata.var[DKM.VAR_USE_FOR_PCA] = False - - adata.var[DKM.VAR_GENE_MEAN_KEY][pass_filter_genes] = mean.flatten() - adata.var[DKM.VAR_GENE_VAR_KEY][pass_filter_genes] = variance - adata.var[DKM.VAR_GENE_HIGHLY_VARIABLE_KEY][pass_filter_genes] = highly_variable_mask - adata.var[DKM.VAR_USE_FOR_PCA][pass_filter_genes] = highly_variable_mask - - main_info("number of selected highly variable genes: " + str(adata.var[DKM.VAR_USE_FOR_PCA].sum())) - if recipe == "svr": - # SVR can give highly_variable_scores - main_info_insert_adata_var(DKM.VAR_GENE_HIGHLY_VARIABLE_SCORES) - adata.var[DKM.VAR_GENE_HIGHLY_VARIABLE_SCORES] = np.nan - adata.var[DKM.VAR_GENE_HIGHLY_VARIABLE_SCORES][pass_filter_genes] = highly_variable_scores.flatten() - - if inplace: - main_info("inplace is True, subset adata according to selected genes.") - adata = adata[:, adata.var[DKM.VAR_USE_FOR_PCA]] - main_finish_progress("filter genes by dispersion") - - -def select_genes_by_seurat_recipe( - adata: AnnData, - sparse_layer_mat: csr_matrix, - n_bins: int = 20, - log_mean_and_dispersion: bool = True, - min_disp: float = None, - max_disp: float = None, - min_mean: float = None, - max_mean: float = None, - n_top_genes: Union[int, None] = None, -) -> Tuple[np.ndarray, np.ndarray, np.ndarray, None]: - """Apply seurat's gene selection recipe by cutoffs. - - Args: - adata: an AnnData object - sparse_layer_mat: the sparse matrix used for gene selection. - n_bins: the number of bins for normalization. Defaults to 20. - log_mean_and_dispersion: whether log the gene expression values before calculating the dispersion values. - Defaults to True. - min_disp: seurat dispersion min cutoff. Defaults to None. - max_disp: seurat dispersion max cutoff. Defaults to None. - min_mean: seurat mean min cutoff. Defaults to None. - max_mean: seurat mean max cutoff. Defaults to None. - n_top_genes: number of top genes to be evaluated. If set to be None, genes are filtered by mean and dispersion - norm threshold. Defaults to None. - - Returns: - A tuple (mean, variance, highly_variable_mask, highly_variable_scores), where mean is the mean of the provided - sparse matrix, variance is the variance of the provided sparse matrix, highly_variable_mask is a bool array - indicating whether an element (a gene) is highly variable in the matrix. highly_variable_scores is always none - since the scores are not applicable to Seurat recipe. - """ - - # default values from Seurat - if min_disp is None: - min_disp = 0.5 - if max_disp is None: - max_disp = np.inf - if min_mean is None: - min_mean = 0.0125 - if max_mean is None: - max_mean = 3 - - mean, variance, dispersion = calc_mean_var_dispersion_sparse(sparse_layer_mat) - sc_mean, sc_var = seurat_get_mean_var(sparse_layer_mat) - mean, variance = sc_mean, sc_var - dispersion = variance / mean - - if log_mean_and_dispersion: - mean = np.log1p(mean) - dispersion[np.equal(dispersion, 0)] = np.nan - dispersion = np.log(dispersion) - - temp_df = pd.DataFrame() - temp_df["mean"], temp_df["dispersion"] = mean, dispersion - - temp_df["mean_bin"] = pd.cut(temp_df["mean"], bins=n_bins) - disp_grouped = temp_df.groupby("mean_bin")["dispersion"] - disp_mean_bin = disp_grouped.mean() - disp_std_bin = disp_grouped.std(ddof=1) - - # handle nan std - one_gene_per_bin = disp_std_bin.isnull() - - disp_std_bin[one_gene_per_bin] = disp_mean_bin[one_gene_per_bin].values - disp_mean_bin[one_gene_per_bin] = 0 - - # normalized dispersion - mean = disp_mean_bin[temp_df["mean_bin"].values].values - std = disp_std_bin[temp_df["mean_bin"].values].values - variance = std**2 - temp_df["dispersion_norm"] = ((temp_df["dispersion"] - mean) / std).fillna(0) - dispersion_norm = temp_df["dispersion_norm"].values - - highly_variable_mask = None - if n_top_genes is not None: - main_info("choose %d top genes" % (n_top_genes), indent_level=2) - threshold = temp_df["dispersion_norm"].nlargest(n_top_genes).values[-1] - highly_variable_mask = temp_df["dispersion_norm"].values >= threshold - else: - main_info("choose genes by mean and dispersion norm threshold", indent_level=2) - highly_variable_mask = np.logical_and.reduce( - ( - mean > min_mean, - mean < max_mean, - dispersion_norm > min_disp, - dispersion_norm < max_disp, - ) - ) - return mean, variance, highly_variable_mask, None - - -def select_genes_by_dispersion_svr( - adata: AnnData, layer_mat: Union[np.array, csr_matrix], n_top_genes: int -) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - """Filters adata's genes according to layer_mat, and set adata's preprocess keys for downstream analysis - - Args: - adata: an AnnData object. - layer_mat: the matrix used for select genes with shape of #cells X #genes. - n_top_genes: the number of genes to use. - - Returns: - A tuple (mean, variance, highly_variable_mask, highly_variable_scores), where mean is the mean of the provided - sparse matrix, variance is the variance of the provided sparse matrix, highly_variable_mask is a bool array - indicating whether an element (a gene) is highly variable in the matrix, and highly_variable_scores is an array - storing the dispersion score for each gene. - """ - - main_debug("type of layer_mat:" + str(type(layer_mat))) - if issparse(layer_mat): - main_info("layer_mat is sparse, dispatch to sparse calc function...") - mean, variance, dispersion = calc_mean_var_dispersion_sparse(layer_mat) - else: - main_info("layer_mat is np, dispatch to sparse calc function...") - mean, variance, dispersion = calc_mean_var_dispersion_ndarray(layer_mat) - - highly_variable_mask, highly_variable_scores = get_highly_variable_mask_by_dispersion_svr( - mean, variance, n_top_genes - ) - variance = np.array(variance).flatten() - - return mean.flatten(), variance, highly_variable_mask, highly_variable_scores - - -# Highly variable gene selection function: -def get_highvar_genes_sparse( - expression: Union[ - np.ndarray, - scipy.sparse.csr_matrix, - scipy.sparse.csc_matrix, - scipy.sparse.coo_matrix, - ], - expected_fano_threshold: Optional[float] = None, - numgenes: Optional[int] = None, - minimal_mean: float = 0.5, -) -> Tuple[pd.DataFrame, Dict]: - """Find highly-variable genes in sparse single-cell data matrices. - - Args: - expression: Gene expression matrix - expected_fano_threshold: Optionally can be used to set a manual dispersion threshold (for definition of - "highly-variable") - numgenes: Optionally can be used to find the n most variable genes - minimal_mean: Sets a threshold on the minimum mean expression to consider - - Returns: - gene_counts_stats: Results dataframe containing pertinent information for each gene - gene_fano_parameters: Additional informative dictionary (w/ records of dispersion for each gene, threshold, - etc.) - """ - gene_mean = np.array(expression.mean(axis=0)).astype(float).reshape(-1) - E2 = expression.copy() - E2.data **= 2 - gene2_mean = np.array(E2.mean(axis=0)).reshape(-1) - gene_var = pd.Series(gene2_mean - (gene_mean**2)) - del E2 - gene_mean = pd.Series(gene_mean) - gene_fano = gene_var / gene_mean - - # Find parameters for expected fano line -- this line can be non-linear... - top_genes = gene_mean.sort_values(ascending=False)[:20].index - A = (np.sqrt(gene_var) / gene_mean)[top_genes].min() - - w_mean_low, w_mean_high = gene_mean.quantile([0.10, 0.90]) - w_fano_low, w_fano_high = gene_fano.quantile([0.10, 0.90]) - winsor_box = ( - (gene_fano > w_fano_low) & (gene_fano < w_fano_high) & (gene_mean > w_mean_low) & (gene_mean < w_mean_high) - ) - fano_median = gene_fano[winsor_box].median() - B = np.sqrt(fano_median) - - gene_expected_fano = (A**2) * gene_mean + (B**2) - fano_ratio = gene_fano / gene_expected_fano - - # Identify high var genes - if numgenes is not None: - highvargenes = fano_ratio.sort_values(ascending=False).index[:numgenes] - high_var_genes_ind = fano_ratio.index.isin(highvargenes) - T = None - else: - if not expected_fano_threshold: - T = 1.0 + gene_fano[winsor_box].std() - else: - T = expected_fano_threshold - - high_var_genes_ind = (fano_ratio > T) & (gene_mean > minimal_mean) - - gene_counts_stats = pd.DataFrame( - { - "mean": gene_mean, - "var": gene_var, - "fano": gene_fano, - "expected_fano": gene_expected_fano, - "high_var": high_var_genes_ind, - "fano_ratio": fano_ratio, - } - ) - gene_fano_parameters = { - "A": A, - "B": B, - "T": T, - "minimal_mean": minimal_mean, - } - return (gene_counts_stats, gene_fano_parameters) - - -def SVRs( - adata_ori: anndata.AnnData, - filter_bool: Union[np.ndarray, None] = None, - layers: str = "X", - relative_expr: bool = True, - total_szfactor: str = "total_Size_Factor", - min_expr_cells: int = 0, - min_expr_avg: int = 0, - max_expr_avg: int = np.inf, - svr_gamma: Union[float, None] = None, - winsorize: bool = False, - winsor_perc: Tuple[float, float] = (1, 99.5), - sort_inverse: bool = False, - use_all_genes_cells: bool = False, -) -> anndata.AnnData: - """Support Vector Regression to identify highly variable genes. - - This function is modified from https://github.com/velocyto-team/velocyto.py/blob/master/velocyto/analysis.py - - Args: - adata_ori: an AnnData object - filter_bool: A boolean array from the user to select genes for downstream analysis. Defaults to None. - layers: The layer(s) to be used for calculating dispersion score via support vector regression (SVR). Defaults - to "X". - relative_expr: A logic flag to determine whether we need to divide gene expression values first by size factor - before run SVR. Defaults to True. - total_szfactor: The column name in the .obs attribute that corresponds to the size factor for the total mRNA. - Defaults to "total_Size_Factor". - min_expr_cells: minimum number of cells that express the gene for it to be considered in the fit. Defaults to 0. - min_expr_avg: The minimum average of genes across cells required for gene to be selected for SVR analyses. - Defaults to 0. - max_expr_avg: The maximum average of genes across cells required for gene to be selected for SVR analyses. Genes - with average gene expression larger than this value will be treated as house-keeping/outlier genes. Defaults - to np.inf. - svr_gamma: the gamma hyper-parameter of the SVR. Defaults to None. - winsorize: Wether to winsorize the data for the cv vs mean model. Defaults to False. - winsor_perc: the up and lower bound of the winsorization. Defaults to (1, 99.5). - sort_inverse: whether to sort genes from less noisy to more noisy (to use for size estimation not for feature - selection). Defaults to False. - use_all_genes_cells: A logic flag to determine whether all cells and genes should be used for the size factor - calculation. Defaults to False. - - Returns: - An updated annData object with `log_m`, `log_cv`, `score` added to .obs columns and `SVR` added to uns attribute - as a new key. - """ - - layers = DKM.get_available_layer_keys(adata_ori, layers) - - if use_all_genes_cells: - # let us ignore the `inplace` parameter in pandas.Categorical.remove_unused_categories warning. - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - adata = adata_ori[:, filter_bool].copy() if filter_bool is not None else adata_ori - else: - cell_inds = adata_ori.obs.use_for_pca if "use_for_pca" in adata_ori.obs.columns else adata_ori.obs.index - filter_list = ["use_for_pca", "pass_basic_filter"] - filter_checker = [i in adata_ori.var.columns for i in filter_list] - which_filter = np.where(filter_checker)[0] - - gene_inds = adata_ori.var[filter_list[which_filter[0]]] if len(which_filter) > 0 else adata_ori.var.index - - # let us ignore the `inplace` parameter in pandas.Categorical.remove_unused_categories warning. - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - adata = adata_ori[cell_inds, gene_inds].copy() - filter_bool = filter_bool[gene_inds] - - for layer in layers: - if layer == "raw": - CM = adata.X.copy() if adata.raw is None else adata.raw - szfactors = ( - adata.obs[layer + "_Size_Factor"].values[:, None] - if adata.raw.X is not None - else adata.obs["Size_Factor"].values[:, None] - ) - elif layer == "X": - CM = adata.X.copy() - szfactors = adata.obs["Size_Factor"].values[:, None] - elif layer == "protein": - if "protein" in adata.obsm_keys(): - CM = adata.obsm["protein"].copy() - szfactors = adata.obs[layer + "_Size_Factor"].values[:, None] - else: - continue - else: - CM = adata.layers[layer].copy() - szfactors = ( - adata.obs[layer + "_Size_Factor"].values[:, None] - if layer + "_Size_Factor" in adata.obs.columns - else None - ) - - if total_szfactor is not None and total_szfactor in adata.obs.keys(): - szfactors = adata.obs[total_szfactor].values[:, None] if total_szfactor in adata.obs.columns else None - - if szfactors is not None and relative_expr: - if issparse(CM): - sparsefuncs.inplace_row_scale(CM, 1 / szfactors) - else: - CM /= szfactors - - if winsorize: - if min_expr_cells <= ((100 - winsor_perc[1]) * CM.shape[0] * 0.01): - min_expr_cells = int(np.ceil((100 - winsor_perc[1]) * CM.shape[1] * 0.01)) + 2 - - detected_bool = np.array( - ((CM > 0).sum(0) >= min_expr_cells) & (CM.mean(0) <= max_expr_avg) & (CM.mean(0) >= min_expr_avg) - ).flatten() - - valid_CM = CM[:, detected_bool] - if winsorize: - down, up = ( - np.percentile(valid_CM.A, winsor_perc, 0) - if issparse(valid_CM) - else np.percentile(valid_CM, winsor_perc, 0) - ) - Sfw = ( - np.clip(valid_CM.A, down[None, :], up[None, :]) - if issparse(valid_CM) - else np.percentile(valid_CM, winsor_perc, 0) - ) - mu = Sfw.mean(0) - sigma = Sfw.std(0, ddof=1) - else: - mu = np.array(valid_CM.mean(0)).flatten() - sigma = ( - np.array( - np.sqrt( - (valid_CM.multiply(valid_CM).mean(0).A1 - (mu) ** 2) - # * (adata.n_obs) - # / (adata.n_obs - 1) - ) - ) - if issparse(valid_CM) - else valid_CM.std(0, ddof=1) - ) - - cv = sigma / mu - log_m = np.array(np.log2(mu)).flatten() - log_cv = np.array(np.log2(cv)).flatten() - log_m[mu == 0], log_cv[mu == 0] = 0, 0 - - if svr_gamma is None: - svr_gamma = 150.0 / len(mu) - # Fit the Support Vector Regression - clf = SVR(gamma=svr_gamma) - # clf.fit(log_m[:, None], log_cv) - - (gene_counts_stats, gene_fano_parameters) = get_highvar_genes_sparse(valid_CM) - target = np.array(gene_counts_stats["fano"]).flatten() - ground = np.array(gene_counts_stats["mean"]).flatten()[:, None] - clf.fit(ground, target) - - fitted_fun = clf.predict - # ff = fitted_fun(log_m[:, None]) - # score = log_cv - ff - ff = fitted_fun(ground) - score = target - ff - if sort_inverse: - score = -score - - prefix = "" if layer == "X" else layer + "_" - (adata.var[prefix + "log_m"], adata.var[prefix + "log_cv"], adata.var[prefix + "score"],) = ( - np.nan, - np.nan, - -np.inf, - ) - ( - adata.var.loc[detected_bool, prefix + "log_m"], - adata.var.loc[detected_bool, prefix + "log_cv"], - adata.var.loc[detected_bool, prefix + "score"], - ) = ( - np.array(ground).flatten(), - np.array(target).flatten(), - np.array(score).flatten(), - ) - - key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR" - adata_ori.uns[key] = {"SVR": fitted_fun} - - adata_ori = merge_adata_attrs(adata_ori, adata, attr="var") - - return adata_ori - - -def select_genes_monocle( - adata: anndata.AnnData, - layer: str = "X", - total_szfactor: str = "total_Size_Factor", - keep_filtered: bool = True, - sort_by: Literal["SVR", "gini", "dispersion"] = "SVR", - n_top_genes: int = 2000, - SVRs_kwargs: dict = {}, - only_bools: bool = False, - exprs_frac_for_gene_exclusion: float = 1, - genes_to_exclude: Union[List[str], None] = None, -) -> Union[anndata.AnnData, np.ndarray]: - """Select genes based on monocle recipe. - - This version is here for modularization of preprocessing, so that users may try combinations of different - preprocessing procesudres in Preprocessor. - - Args: - adata: an AnnData object. - layer: The data from a particular layer (include X) used for feature selection. Defaults to "X". - total_szfactor: The column name in the .obs attribute that corresponds to the size factor for the total mRNA. - Defaults to "total_Size_Factor". - keep_filtered: Whether to keep genes that don't pass the filtering in the adata object. Defaults to True. - sort_by: the sorting methods, either SVR, dispersion or Gini index, to be used to select genes. Defaults to - "SVR". - n_top_genes: the number of top genes based on scoring method (specified by sort_by) will be selected as feature - genes. Defaults to 2000. - SVRs_kwargs: kwargs for `SVRs`. Defaults to {}. - only_bools: Only return a vector of bool values. Defaults to False. - exprs_frac_for_gene_exclusion: threshold of fractions for high fraction genes. Defaults to 1. - genes_to_exclude: genes that are excluded from evaluation. Defaults to None. - - Returns: - The adata object with genes updated if `only_bools` is false. Otherwise, the bool array representing selected - genes. - """ - - # The following size factor calculation is now a prerequisite for monocle recipe preprocess in preprocessor. - adata = calc_sz_factor( - adata, - total_layers=adata.uns["pp"]["experiment_total_layers"], - scale_to=None, - splicing_total_layers=False, - X_total_layers=False, - layers=adata.uns["pp"]["experiment_layers"], - genes_use_for_norm=None, - ) - - filter_bool = ( - adata.var["pass_basic_filter"] - if "pass_basic_filter" in adata.var.columns - else np.ones(adata.shape[1], dtype=bool) - ) - - if adata.shape[1] <= n_top_genes: - filter_bool = np.ones(adata.shape[1], dtype=bool) - else: - if sort_by == "dispersion": - table = top_table(adata, layer, mode="dispersion") - valid_table = table.query("dispersion_empirical > dispersion_fit") - valid_table = valid_table.loc[ - set(adata.var.index[filter_bool]).intersection(valid_table.index), - :, - ] - gene_id = np.argsort(-valid_table.loc[:, "dispersion_empirical"])[:n_top_genes] - gene_id = valid_table.iloc[gene_id, :].index - filter_bool = adata.var.index.isin(gene_id) - elif sort_by == "gini": - table = top_table(adata, layer, mode="gini") - valid_table = table.loc[filter_bool, :] - gene_id = np.argsort(-valid_table.loc[:, "gini"])[:n_top_genes] - gene_id = valid_table.index[gene_id] - filter_bool = gene_id.isin(adata.var.index) - elif sort_by == "SVR": - SVRs_args = { - "min_expr_cells": 0, - "min_expr_avg": 0, - "max_expr_avg": np.inf, - "svr_gamma": None, - "winsorize": False, - "winsor_perc": (1, 99.5), - "sort_inverse": False, - } - SVRs_args = update_dict(SVRs_args, SVRs_kwargs) - adata = SVRs( - adata, - layers=[layer], - total_szfactor=total_szfactor, - filter_bool=filter_bool, - **SVRs_args, - ) - filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) - # elif sort_by == "fano": - - # filter genes by gene expression fraction as well - adata.var["frac"], invalid_ids = compute_gene_exp_fraction(X=adata.X, threshold=exprs_frac_for_gene_exclusion) - genes_to_exclude = ( - list(adata.var_names[invalid_ids]) - if genes_to_exclude is None - else genes_to_exclude + list(adata.var_names[invalid_ids]) - ) - if genes_to_exclude is not None and len(genes_to_exclude) > 0: - adata_exclude_genes = adata.var.index.intersection(genes_to_exclude) - adata.var.loc[adata_exclude_genes, "use_for_pca"] = False - - if keep_filtered: - adata.var["use_for_pca"] = filter_bool - else: - adata._inplace_subset_var(filter_bool) - adata.var["use_for_pca"] = True - - return filter_bool if only_bools else adata - - -def get_highly_variable_mask_by_dispersion_svr( - mean: np.ndarray, - var: np.ndarray, - n_top_genes: int, - svr_gamma: Union[float, None] = None, - return_scores: bool = True, -) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]: - """Returns the mask with shape same as mean and var. - - The mask indicates whether each index is highly variable or not. Each index should represent a gene. - - Args: - mean: mean of the genes. - var: variance of the genes. - n_top_genes: the number of top genes to be inspected. - svr_gamma: coefficient for support vector regression. Defaults to None. - return_scores: whether returen the dispersion scores. Defaults to True. - - Returns: - A tuple (highly_variable_mask, scores) where highly_variable_mask is a bool array indicating whether an element - (a gene) is highly variable in the matrix and scores is an array recording variable score for each gene. scores - would only be returned when `return_scores` is True. - """ - - # normally, select svr_gamma based on #features - if svr_gamma is None: - svr_gamma = 150.0 / len(mean) - from sklearn.svm import SVR - - mean_log = np.log2(mean) - cv_log = np.log2(np.sqrt(var) / mean) - classifier = SVR(gamma=svr_gamma) - # fit&preidction will complain about nan values if not take cared here - is_nan_indices = np.logical_or(np.isnan(mean_log), np.isnan(cv_log)) - if np.sum(is_nan_indices) > 0: - main_warning( - ( - "mean and cv_log contain NAN values. We exclude them in SVR training. Please use related gene filtering " - "methods to filter genes with zero means." - ) - ) - - classifier.fit(mean_log[~is_nan_indices, np.newaxis], cv_log.reshape([-1, 1])[~is_nan_indices]) - scores = np.repeat(np.nan, len(mean_log)) - # TODO handle nan values during prediction here - scores[~is_nan_indices] = cv_log[~is_nan_indices] - classifier.predict(mean_log[~is_nan_indices, np.newaxis]) - scores = scores.reshape([-1, 1]) # shape should be #genes x 1 - - # score threshold based on n top genes - n_top_genes = min(n_top_genes, len(mean)) # maybe not enough genes there - score_threshold = np.sort(-scores)[n_top_genes - 1] - highly_variable_mask = scores >= score_threshold - highly_variable_mask = np.array(highly_variable_mask).flatten() - if return_scores: - return highly_variable_mask, scores - return highly_variable_mask - - def log1p_adata_layer(adata: AnnData, layer: str = DKM.X_LAYER, copy: bool = False) -> AnnData: """Calculate log1p of adata's specific layer. From 2d1c727763fd3a90eea7dbe86a68c3613659cdb0 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Tue, 4 Apr 2023 20:06:22 -0400 Subject: [PATCH 05/28] Fix to gini function a lot faster --- dynamo/preprocessing/preprocess.py | 82 +++++++++++++++--------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index 9c6cad63e..412b2f1c3 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -308,14 +308,16 @@ def normalize_cell_expr_by_size_factors_legacy( def Gini(adata: anndata.AnnData, layers: Union[Literal["all"], List[str]] = "all") -> anndata.AnnData: - """Calculate the Gini coefficient of a numpy array. https://github.com/thomasmaxwellnorman/perturbseq_demo/blob/master/perturbseq/util.py + """Calculate the Gini coefficient of a numpy array. + https://github.com/thomasmaxwellnorman/perturbseq_demo/blob/master/perturbseq/util.py Args: adata: an AnnData object layers: the layer(s) to be normalized. Defaults to "all". Returns: - An updated anndata object with gini score for the layers (include .X) in the corresponding var columns (layer + '_gini'). + An updated anndata object with gini score for the layers (include .X) in the corresponding var columns + (layer + '_gini'). """ # From: https://github.com/oliviaguest/gini @@ -324,6 +326,34 @@ def Gini(adata: anndata.AnnData, layers: Union[Literal["all"], List[str]] = "all layers = DynamoAdataKeyManager.get_available_layer_keys(adata, layers) + def compute_gini(CM): + # convert to dense array if sparse + if issparse(CM): + CM = CM.A + + # shift all values to be non-negative + CM -= np.min(CM) + + # add small constant to avoid zeros + CM = CM.astype(float) + 0.0000001 # values cannot be 0 + + # sort values along axis 0 + CM = np.sort(CM, axis=0) + + # compute index array + n = CM.shape[0] + index = np.arange(1, n + 1) + + # compute numerator and denominator of Gini coefficient + test = 2 * index - n - 1 + numerator = np.sum(test[:, np.newaxis] * CM, axis=0) + denominator = n * np.sum(CM, axis=0) + + # compute Gini coefficient for each feature + gini = numerator / denominator + + return gini + for layer in layers: if layer == "raw": CM = adata.raw.X @@ -337,44 +367,12 @@ def Gini(adata: anndata.AnnData, layers: Union[Literal["all"], List[str]] = "all else: CM = adata.layers[layer] - n_features = adata.shape[1] - gini = np.zeros(n_features) - - for i in np.arange(n_features): - # all values are treated equally, arrays must be 1d - cur_cm = CM[:, i].A if issparse(CM) else CM[:, i] - if np.amin(CM) < 0: - cur_cm -= np.amin(cur_cm) # values cannot be negative - cur_cm = cur_cm.astype(float) + 0.0000001 # np.min(array[array!=0]) #values cannot be 0 - cur_cm = np.sort(cur_cm, axis=0) # values must be sorted - # index per array element - index = np.arange(1, cur_cm.shape[0] + 1) - n = cur_cm.shape[0] # number of array elements - aa = np.sum((2 * index - n - 1) * cur_cm) - bb = n * np.sum(cur_cm) - # gini[i] = (np.sum((2 * index - n - 1) * cur_cm)) / (n * np.sum(cur_cm)) # Gini coefficient - gini[i] = aa / bb - print(i, aa, bb, gini[i]) - - # # all values are treated equally, arrays must be 1d - # cur_cm = CM.toarray() if issparse(CM) else CM.copy() - # cur_cm[cur_cm < 0] = 0 # values cannot be negative - # cur_cm = cur_cm.astype(float) + 0.0000001 # values cannot be 0 - # cur_cm = np.sort(cur_cm, axis=0) # values must be sorted - # # index per array element - # index = np.arange(1, cur_cm.shape[0] + 1) - # n = cur_cm.shape[0] # number of array elements - # ccc = (2 * index - n - 1)[:, np.newaxis] * cur_cm - # aa = np.sum((2 * index - n - 1)[:, np.newaxis] * cur_cm, axis=0) - # bb = n * np.sum(cur_cm, axis=0) - # gini = aa / bb - # print(i, aa, bb, ccc, gini[i]) - # #gini = (np.sum((2 * index - n - 1) * cur_cm, axis=0)) / (n * np.sum(cur_cm, axis=0)) # Gini coefficient - # - # if layer in ["raw", "X"]: - # adata.var["gini"] = gini - # else: - # adata.var[layer + "_gini"] = gini + var_gini = compute_gini(CM) + + if layer in ["raw", "X"]: + adata.var["gini"] = var_gini + else: + adata.var[layer + "_gini"] = var_gini return adata @@ -1415,7 +1413,7 @@ def recipe_velocyto( adata = adata[:, filter_bool] - adata = get_highly_variable_genes_by_svr( + adata = select_genes_by_svr( adata, layers=["spliced"], min_expr_cells=2, @@ -1658,7 +1656,7 @@ def select_genes_monocle_legacy( "sort_inverse": False, } SVRs_args = update_dict(SVRs_args, SVRs_kwargs) - adata = get_highly_variable_genes_by_svr( + adata = select_genes_by_svr( adata, layers=[layer], total_szfactor=total_szfactor, From a4ffc8c8f059c928282c3ad944cdea89579c9971 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Wed, 5 Apr 2023 18:34:36 -0400 Subject: [PATCH 06/28] fixed a bug in gini and added docstring --- dynamo/preprocessing/__init__.py | 2 +- dynamo/preprocessing/gene_selection.py | 219 +++++++++++++++---------- dynamo/preprocessing/preprocess.py | 70 -------- 3 files changed, 136 insertions(+), 155 deletions(-) diff --git a/dynamo/preprocessing/__init__.py b/dynamo/preprocessing/__init__.py index 90dfa63a7..89ccb428d 100755 --- a/dynamo/preprocessing/__init__.py +++ b/dynamo/preprocessing/__init__.py @@ -4,7 +4,6 @@ from .cell_cycle import cell_cycle_scores from .dynast import lambda_correction from .preprocess import ( - Gini, calc_sz_factor_legacy, filter_cells_by_outliers, filter_cells_legacy, @@ -37,6 +36,7 @@ from .CnmfPreprocessor import CnmfPreprocessor from .gene_selection import ( + Gini, estimate_dispersion, select_genes_by_svr, select_genes_monocle, diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 2af9f81bc..a02b4e935 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -1,6 +1,8 @@ import re import warnings -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union + +from numpy import ndarray try: from typing import Literal @@ -14,7 +16,7 @@ from anndata import AnnData from scipy.sparse import csr_matrix, issparse -from ..configuration import DKM, DynamoAdataConfig, DynamoAdataKeyManager +from ..configuration import DKM from ..dynamo_logger import ( LoggerManager, main_debug, @@ -23,7 +25,6 @@ main_warning, ) from .preprocessor_utils import ( - calc_mean_var_dispersion_sparse, calc_sz_factor, get_nan_or_inf_data_bool_mask, get_svr_filter, @@ -32,6 +33,67 @@ from .utils import compute_gene_exp_fraction, cook_dist, merge_adata_attrs +def Gini(adata: AnnData, layers: Union[Literal["all"], List[str]] = "all") -> AnnData: + """Calculate the Gini coefficient of a numpy array. + https://github.com/thomasmaxwellnorman/perturbseq_demo/blob/master/perturbseq/util.py + + Args: + adata: an AnnData object + layers: the layer(s) to be normalized. Defaults to "all". + + Returns: + An updated anndata object with gini score for the layers (include .X) in the corresponding var columns + (layer + '_gini'). + """ + + # From: https://github.com/oliviaguest/gini + # based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif + # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm + + layers = DKM.get_available_layer_keys(adata, layers) + + def compute_gini(CM): + # convert to dense array if sparse + if issparse(CM): + CM = CM.A + + # shift all values to be non-negative + CM -= np.min(CM) + + # add small constant to avoid zeros + CM = CM.astype(float) + 0.0000001 # values cannot be 0 + + # sort values along axis 0 + CM = np.sort(CM, axis=0) + + # compute index array + n = CM.shape[0] + index = 2 * (np.arange(1, n + 1)) - n - 1 + + # compute Gini coefficient for each feature + gini = (np.sum(index[:, np.newaxis] * CM, axis=0)) / (n * np.sum(CM, axis=0)) + + return gini + + for layer in layers: + if layer == "raw": + CM = adata.raw.X + elif layer == "X": + CM = adata.X + elif layer == "protein": + if "protein" in adata.obsm_keys(): + CM = adata.obsm[layer] + else: + continue + else: + CM = adata.layers[layer] + + var_gini = compute_gini(CM) + adata.var[layer + "_gini"] = var_gini + + return adata + + def parametric_dispersion_fit( disp_table: pd.DataFrame, initial_coefs: np.ndarray = np.array([1e-6, 1]) ) -> Tuple[sm.formula.glm, np.ndarray, pd.DataFrame]: @@ -46,7 +108,7 @@ def parametric_dispersion_fit( Returns: A tuple (fit, coefs, good), where fit is a statsmodels fitting object, coefs contains the two resulting gamma - fitting coefficient, and good is the the subsetted dispersion table that is subjected to Gamma fitting. + fitting coefficient, and good is the subsetted dispersion table that is subjected to Gamma fitting. """ coefs = initial_coefs @@ -100,7 +162,7 @@ def disp_calc_helper_NB( layers: a list of layers available. res_list: a list of pd.DataFrames with mu, dispersion for each gene that passes filters. """ - layers = DynamoAdataKeyManager.get_available_layer_keys(adata, layers=layers, include_protein=False) + layers = DKM.get_available_layer_keys(adata, layers=layers, include_protein=False) res_list = [] for layer in layers: @@ -276,7 +338,7 @@ def top_table(adata: AnnData, layer: str = "X", mode: Literal["dispersion", "gin the columns. """ - layer = DynamoAdataKeyManager.get_available_layer_keys(adata, layers=layer, include_protein=False)[0] + layer = DKM.get_available_layer_keys(adata, layers=layer, include_protein=False)[0] if layer in ["X"]: key = "dispFitInfo" @@ -318,7 +380,7 @@ def select_genes_monocle( exprs_frac_for_gene_exclusion: float = 1, genes_to_exclude: Union[List[str], None] = None, SVRs_kwargs: dict = {}, -) -> Union[AnnData, np.ndarray]: +): """Select genes based on monocle recipe. This version is here for modularization of preprocessing, so that users may try combinations of different @@ -327,17 +389,14 @@ def select_genes_monocle( Args: adata: an AnnData object. layer: The data from a particular layer (include X) used for feature selection. Defaults to "X". - total_szfactor: The column name in the .obs attribute that corresponds to the size factor for the total mRNA. - Defaults to "total_Size_Factor". keep_filtered: Whether to keep genes that don't pass the filtering in the adata object. Defaults to True. - sort_by: the sorting methods, either SVR, dispersion or Gini index, to be used to select genes. Defaults to - "SVR". n_top_genes: the number of top genes based on scoring method (specified by sort_by) will be selected as feature genes. Defaults to 2000. - SVRs_kwargs: kwargs for `SVRs`. Defaults to {}. - only_bools: Only return a vector of bool values. Defaults to False. + sort_by: the sorting methods, either SVR, dispersion or Gini index, to be used to select genes. Defaults to + "SVR". TODO: Should be fixed! exprs_frac_for_gene_exclusion: threshold of fractions for high fraction genes. Defaults to 1. genes_to_exclude: genes that are excluded from evaluation. Defaults to None. + SVRs_kwargs: kwargs for `SVRs`. Defaults to {}. Returns: The adata object with genes updated if `only_bools` is false. Otherwise, the bool array representing selected @@ -364,52 +423,14 @@ def select_genes_monocle( if adata.shape[1] <= n_top_genes: filter_bool = np.ones(adata.shape[1], dtype=bool) else: - # table = top_table(adata, layer, mode="dispersion") - # valid_table = table.query("dispersion_empirical > dispersion_fit") - # valid_table = valid_table.loc[ - # set(adata.var.index[filter_bool]).intersection(valid_table.index), - # :, - # ] - # gene_id = np.argsort(-valid_table.loc[:, "dispersion_empirical"])[:n_top_genes] - # gene_id = valid_table.iloc[gene_id, :].index - # filter_bool = adata.var.index.isin(gene_id) if sort_by == "gini": - # table = top_table(adata, layer, mode="gini") - valid_table = adata.var[layer + "_gini"].loc[filter_bool, :] - gene_id = np.argsort(-valid_table.loc[:, "gini"])[:n_top_genes] - gene_id = valid_table.index[gene_id] - filter_bool = gene_id.isin(adata.var.index) - # elif : - # SVRs_args = { - # "min_expr_cells": 0, - # "min_expr_avg": 0, - # "max_expr_avg": np.inf, - # "svr_gamma": None, - # "winsorize": False, - # "winsor_perc": (1, 99.5), - # "sort_inverse": False, - # } - # SVRs_args = update_dict(SVRs_args, SVRs_kwargs) - # adata = SVRs( - # adata, - # layers=[layer], - # total_szfactor=total_szfactor, - # filter_bool=filter_bool, - # **SVRs_args, - # ) - # filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) + if layer + "_gini" is not adata.var.keys(): + Gini(adata) + valid_table = adata.var[layer + "_gini"][filter_bool] + feature_gene_idx = np.argsort(-valid_table)[:n_top_genes] + feature_gene_idx = valid_table.index[feature_gene_idx] + filter_bool = filter_bool.index.isin(feature_gene_idx) elif sort_by == "cv_dispersion" or sort_by == "fano_dispersion": - # These parameters are already defined as default values in SVRs function. Do we still need this? - # SVRs_args = { - # "min_expr_cells": 0, - # "min_expr_avg": 0, - # "max_expr_avg": np.inf, - # "svr_gamma": None, - # "winsorize": False, - # "winsor_perc": (1, 99.5), - # "sort_inverse": False, - # } - # SVRs_args = update_dict(SVRs_args, SVRs_kwargs) adata = select_genes_by_svr( adata, layers=[layer], @@ -438,8 +459,6 @@ def select_genes_monocle( adata._inplace_subset_var(filter_bool) adata.var["use_for_pca"] = True - # return filter_bool if only_bools else adata - def select_genes_by_svr( adata_ori: AnnData, @@ -458,19 +477,7 @@ def select_genes_by_svr( filter_bool: A boolean array from the user to select genes for downstream analysis. Defaults to None. layers: The layer(s) to be used for calculating dispersion score via support vector regression (SVR). Defaults to "X". - relative_expr: A logic flag to determine whether we need to divide gene expression values first by size factor - before run SVR. Defaults to True. - total_szfactor: The column name in the .obs attribute that corresponds to the size factor for the total mRNA. - Defaults to "total_Size_Factor". - min_expr_cells: minimum number of cells that express the gene for it to be considered in the fit. Defaults to 0. - min_expr_avg: The minimum average of genes across cells required for gene to be selected for SVR analyses. - Defaults to 0. - max_expr_avg: The maximum average of genes across cells required for gene to be selected for SVR analyses. Genes - with average gene expression larger than this value will be treated as house-keeping/outlier genes. Defaults - to np.inf. - svr_gamma: the gamma hyper-parameter of the SVR. Defaults to None. - winsorize: Weather to winsorize the data for the cv vs mean model. Defaults to False. - winsor_perc: the up and lower bound of the winsorization. Defaults to (1, 99.5). + algorithm: sort_inverse: whether to sort genes from less noisy to more noisy (to use for size estimation not for feature selection). Defaults to False. use_all_genes_cells: A logic flag to determine whether all cells and genes should be used for the size factor @@ -552,6 +559,29 @@ def get_vaild_CM( winsorize: bool = False, winsor_perc: Tuple[float, float] = (1, 99.5), ): + """Find a valid CM that is the data of the layer corresponding to the size factor. + + Args: + adata: an AnnData object. + layer: The data from a particular layer (include X) used for feature selection. Defaults to "X". + relative_expr: A logic flag to determine whether we need to divide gene expression values first by size factor + before run SVR. Defaults to True. + total_szfactor: The column name in the .obs attribute that corresponds to the size factor for the total mRNA. + Defaults to "total_Size_Factor". + min_expr_cells: minimum number of cells that express the gene for it to be considered in the fit. Defaults to 0. + min_expr_avg: The minimum average of genes across cells required for gene to be selected for SVR analyses. + Defaults to 0. + max_expr_avg: The maximum average of genes across cells required for gene to be selected for SVR analyses. Genes + with average gene expression larger than this value will be treated as house-keeping/outlier genes. Defaults + to np.inf. + winsorize: Weather to winsorize the data for the cv vs mean model. Defaults to False. + winsor_perc: the up and lower bound of the winsorization. Defaults to (1, 99.5). + + Returns: + An updated annData object with `log_m`, `log_cv`, `score` added to .obs columns and `SVR` added to uns attribute + as a new key. + """ + CM = None if layer == "raw": CM = adata.X.copy() if adata.raw is None else adata.raw @@ -596,6 +626,19 @@ def get_vaild_CM( def get_ground_target(algorithm, valid_CM, winsorize, winsor_perc) -> AnnData: + """Find the training and target dataset to perform a base class for estimators that use libsvm as backing library. + + Args: + algorithm: Method of calculating mean and coefficient of variation, either fano_dispersion or cv_dispersion. + valid_CM: Gene expression matrix. + winsorize: Weather to winsorize the data for the cv vs mean model. Defaults to False. + winsor_perc: the up and lower bound of the winsorization. Defaults to (1, 99.5). + + Returns: + ground: the training array dataset that contains mean values of gene expression. + target: the target array dataset with coefficient of variation of gene expression. + """ + if algorithm == "fano_dispersion": (gene_counts_stats, gene_fano_parameters) = get_highvar_genes_sparse(valid_CM) ground = np.array(gene_counts_stats["mean"]).flatten()[:, None] @@ -620,7 +663,7 @@ def get_ground_target(algorithm, valid_CM, winsorize, winsor_perc) -> AnnData: sigma = ( np.array( np.sqrt( - (valid_CM.multiply(valid_CM).mean(0).A1 - (mu) ** 2) + (valid_CM.multiply(valid_CM).mean(0).A1 - mu**2) # * (adata.n_obs) # / (adata.n_obs - 1) ) @@ -641,7 +684,18 @@ def get_ground_target(algorithm, valid_CM, winsorize, winsor_perc) -> AnnData: return ground, target, mu -def get_prediction_by_svr(ground, target, mean, svr_gamma): +def get_prediction_by_svr(ground: np.ndarray, target: np.ndarray, mean: np.ndarray, svr_gamma: Optional[float] = None): + """This function will return the base class for estimators that use libsvm as backing library. + + Args: + ground: the training array dataset that contains mean values of gene expression. + target: the target array dataset with coefficient of variation of gene expression. + mean: the mean value to estimate a value of svr_gamma. + svr_gamma: the gamma hyperparameter of the SVR. Defaults to None. + + Returns: + A fitted SVM model according to the given training and target data. + """ from sklearn.svm import SVR if svr_gamma is None: @@ -732,7 +786,7 @@ def get_highvar_genes_sparse( "T": T, "minimal_mean": minimal_mean, } - return (gene_counts_stats, gene_fano_parameters) + return gene_counts_stats, gene_fano_parameters def select_genes_by_seurat_recipe( @@ -795,7 +849,6 @@ def select_genes_by_seurat_recipe( if algorithm == "seurat_dispersion": mean, variance, highly_variable_mask = select_genes_by_seurat_dispersion( - subset_adata, layer_mat, min_disp=seurat_min_disp, max_disp=seurat_max_disp, @@ -836,7 +889,6 @@ def select_genes_by_seurat_recipe( def select_genes_by_seurat_dispersion( - adata: AnnData, sparse_layer_mat: csr_matrix, n_bins: int = 20, log_mean_and_dispersion: bool = True, @@ -845,11 +897,10 @@ def select_genes_by_seurat_dispersion( min_mean: float = None, max_mean: float = None, n_top_genes: Union[int, None] = None, -) -> Tuple[np.ndarray, np.ndarray, np.ndarray, None]: +) -> Tuple[ndarray, ndarray, Union[bool, ndarray]]: """Apply seurat's gene selection recipe by cutoffs. Args: - adata: an AnnData object sparse_layer_mat: the sparse matrix used for gene selection. n_bins: the number of bins for normalization. Defaults to 20. log_mean_and_dispersion: whether log the gene expression values before calculating the dispersion values. @@ -911,7 +962,7 @@ def select_genes_by_seurat_dispersion( highly_variable_mask = None if n_top_genes is not None: - main_info("choose %d top genes" % (n_top_genes), indent_level=2) + main_info("choose %d top genes" % n_top_genes, indent_level=2) threshold = temp_df["dispersion_norm"].nlargest(n_top_genes).values[-1] highly_variable_mask = temp_df["dispersion_norm"].values >= threshold else: @@ -932,7 +983,7 @@ def get_highly_variable_mask_by_dispersion_svr( mean: np.ndarray, var: np.ndarray, n_top_genes: int, - svr_gamma: Union[float, None] = None, + svr_gamma: Optional[float] = None, return_scores: bool = True, ) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]: """Returns the mask with shape same as mean and var. @@ -944,7 +995,7 @@ def get_highly_variable_mask_by_dispersion_svr( var: variance of the genes. n_top_genes: the number of top genes to be inspected. svr_gamma: coefficient for support vector regression. Defaults to None. - return_scores: whether returen the dispersion scores. Defaults to True. + return_scores: whether return the dispersion scores. Defaults to True. Returns: A tuple (highly_variable_mask, scores) where highly_variable_mask is a bool array indicating whether an element @@ -965,8 +1016,8 @@ def get_highly_variable_mask_by_dispersion_svr( if np.sum(is_nan_indices) > 0: main_warning( ( - "mean and cv_log contain NAN values. We exclude them in SVR training. Please use related gene filtering " - "methods to filter genes with zero means." + "mean and cv_log contain NAN values. We exclude them in SVR training. Please use related gene filtering" + " methods to filter genes with zero means." ) ) diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index 412b2f1c3..c750298a8 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -307,76 +307,6 @@ def normalize_cell_expr_by_size_factors_legacy( return adata -def Gini(adata: anndata.AnnData, layers: Union[Literal["all"], List[str]] = "all") -> anndata.AnnData: - """Calculate the Gini coefficient of a numpy array. - https://github.com/thomasmaxwellnorman/perturbseq_demo/blob/master/perturbseq/util.py - - Args: - adata: an AnnData object - layers: the layer(s) to be normalized. Defaults to "all". - - Returns: - An updated anndata object with gini score for the layers (include .X) in the corresponding var columns - (layer + '_gini'). - """ - - # From: https://github.com/oliviaguest/gini - # based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif - # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm - - layers = DynamoAdataKeyManager.get_available_layer_keys(adata, layers) - - def compute_gini(CM): - # convert to dense array if sparse - if issparse(CM): - CM = CM.A - - # shift all values to be non-negative - CM -= np.min(CM) - - # add small constant to avoid zeros - CM = CM.astype(float) + 0.0000001 # values cannot be 0 - - # sort values along axis 0 - CM = np.sort(CM, axis=0) - - # compute index array - n = CM.shape[0] - index = np.arange(1, n + 1) - - # compute numerator and denominator of Gini coefficient - test = 2 * index - n - 1 - numerator = np.sum(test[:, np.newaxis] * CM, axis=0) - denominator = n * np.sum(CM, axis=0) - - # compute Gini coefficient for each feature - gini = numerator / denominator - - return gini - - for layer in layers: - if layer == "raw": - CM = adata.raw.X - elif layer == "X": - CM = adata.X - elif layer == "protein": - if "protein" in adata.obsm_keys(): - CM = adata.obsm[layer] - else: - continue - else: - CM = adata.layers[layer] - - var_gini = compute_gini(CM) - - if layer in ["raw", "X"]: - adata.var["gini"] = var_gini - else: - adata.var[layer + "_gini"] = var_gini - - return adata - - def disp_calc_helper_NB( adata: anndata.AnnData, layers: str = "X", min_cells_detected: int = 1 ) -> Tuple[List[str], List[pd.DataFrame]]: From a4dc64842cf72a0fe990d0b2c70d2f2cdb9a83a9 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Thu, 6 Apr 2023 16:23:23 -0400 Subject: [PATCH 07/28] fix function name get_mean_cv and added test --- dynamo/preprocessing/Preprocessor.py | 2 +- dynamo/preprocessing/gene_selection.py | 45 +++++++++-------- tests/test_preprocess.py | 69 +++++++++++++++++++++++++- 3 files changed, 93 insertions(+), 23 deletions(-) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 6ae1d1aa5..6c8b60696 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -88,7 +88,7 @@ def __init__( normalize_cell_expr_by_size_factors. normalize_by_cells_function_kwargs: arguments that will be passed to normalize_by_cells_function. Defaults to {}. - select_genes_function: function for selecting gene features. Defaults to select_genes_by_dispersion_general. + select_genes_function: function for selecting gene features. Defaults to select_genes_monocle. select_genes_kwargs: arguments that will be passed to select_genes. Defaults to {}. normalize_selected_genes_function: function for normalize selected genes. Defaults to None. normalize_selected_genes_kwargs: arguments that will be passed to normalize_selected_genes. Defaults to {}. diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index a02b4e935..67b712cb1 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -477,7 +477,7 @@ def select_genes_by_svr( filter_bool: A boolean array from the user to select genes for downstream analysis. Defaults to None. layers: The layer(s) to be used for calculating dispersion score via support vector regression (SVR). Defaults to "X". - algorithm: + algorithm: Select a method to calculate the dispersion of genes, whether "cv_dispersion" or "fano_dispersion" sort_inverse: whether to sort genes from less noisy to more noisy (to use for size estimation not for feature selection). Defaults to False. use_all_genes_cells: A logic flag to determine whether all cells and genes should be used for the size factor @@ -517,9 +517,9 @@ def select_genes_by_svr( if valid_CM is None: continue - ground, target, mean = get_ground_target(algorithm, valid_CM, winsorize, winsor_perc) - fitted_fun = get_prediction_by_svr(ground, target, mean, svr_gamma) - score = target - fitted_fun(ground) + mean, cv = get_mean_cv(valid_CM, algorithm, winsorize, winsor_perc) + fitted_fun = get_prediction_by_svr(mean, cv, svr_gamma) + score = cv - fitted_fun(mean) if sort_inverse: score = -score @@ -538,8 +538,8 @@ def select_genes_by_svr( adata.var.loc[detected_bool, prefix + "log_cv"], adata.var.loc[detected_bool, prefix + "score"], ) = ( - np.array(ground).flatten(), - np.array(target).flatten(), + np.array(mean).flatten(), + np.array(cv).flatten(), np.array(score).flatten(), ) @@ -625,25 +625,31 @@ def get_vaild_CM( return CM[:, detected_bool], detected_bool -def get_ground_target(algorithm, valid_CM, winsorize, winsor_perc) -> AnnData: - """Find the training and target dataset to perform a base class for estimators that use libsvm as backing library. +def get_mean_cv( + valid_CM: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix, scipy.sparse.coo_matrix], + algorithm: Literal["cv_dispersion", "fano_dispersion"] = "cv_dispersion", + winsorize: bool = False, + winsor_perc: Tuple[float, float] = (1, 99.5), +) -> AnnData: + """Find the mean and coefficient of variation of gene expression. Args: algorithm: Method of calculating mean and coefficient of variation, either fano_dispersion or cv_dispersion. - valid_CM: Gene expression matrix. - winsorize: Weather to winsorize the data for the cv vs mean model. Defaults to False. - winsor_perc: the up and lower bound of the winsorization. Defaults to (1, 99.5). + valid_CM: Gene expression matrix to be used in a downstream analysis. + winsorize: Whether to winsorize the data for the cv vs mean model. Defaults to False. + winsor_perc: The up and lower bound of the winsorization. Defaults to (1, 99.5). Returns: - ground: the training array dataset that contains mean values of gene expression. - target: the target array dataset with coefficient of variation of gene expression. + mean: the array dataset that contains mean values of gene expression. + cv: the array dataset with coefficient of variation of gene expression. """ if algorithm == "fano_dispersion": (gene_counts_stats, gene_fano_parameters) = get_highvar_genes_sparse(valid_CM) - ground = np.array(gene_counts_stats["mean"]).flatten()[:, None] - target = np.array(gene_counts_stats["fano"]).flatten() + mean = np.array(gene_counts_stats["mean"]).flatten()[:, None] + cv = np.array(gene_counts_stats["fano"]).flatten() mu = gene_counts_stats["mean"] + return mean, cv elif algorithm == "cv_dispersion": if winsorize: down, up = ( @@ -676,15 +682,12 @@ def get_ground_target(algorithm, valid_CM, winsorize, winsor_perc) -> AnnData: log_m = np.array(np.log2(mu)).flatten() log_cv = np.array(np.log2(cv)).flatten() log_m[mu == 0], log_cv[mu == 0] = 0, 0 - ground = log_m[:, None] - target = log_cv + return log_m[:, None], log_cv else: raise ValueError(f"The algorithm {algorithm} is not existed") - return ground, target, mu - -def get_prediction_by_svr(ground: np.ndarray, target: np.ndarray, mean: np.ndarray, svr_gamma: Optional[float] = None): +def get_prediction_by_svr(ground: np.ndarray, target: np.ndarray, svr_gamma: Optional[float] = None): """This function will return the base class for estimators that use libsvm as backing library. Args: @@ -699,7 +702,7 @@ def get_prediction_by_svr(ground: np.ndarray, target: np.ndarray, mean: np.ndarr from sklearn.svm import SVR if svr_gamma is None: - svr_gamma = 150.0 / len(mean) + svr_gamma = 150.0 / len(ground) # Fit the Support Vector Regression clf = SVR(gamma=svr_gamma) diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index 0f6842aef..342cb94fa 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -1,3 +1,5 @@ +import timeit + import anndata import numpy as np import pandas as pd @@ -17,7 +19,6 @@ is_nonnegative, is_nonnegative_integer_arr, log1p_adata, - select_genes_by_dispersion_general, ) from dynamo.preprocessing.utils import convert_layers2csr @@ -209,6 +210,71 @@ def test_is_nonnegative(): assert not is_nonnegative_integer_arr(test_mat) +def test_gene_selection_method(): + adata = dyn.sample_data.zebrafish() + dyn.pl.basic_stats(adata) + dyn.pl.highest_frac_genes(adata) + + # Drawing for the downstream analysis. + # df = adata.obs.loc[:, ["nCounts", "pMito", "nGenes"]] + # g = sns.PairGrid(df, y_vars=["pMito", "nGenes"], x_vars=["nCounts"], height=4) + # g.map(sns.regplot, color=".3") + # # g.set(ylim=(-1, 11), yticks=[0, 5, 10]) + # g.add_legend() + # plt.show() + + bdata = adata.copy() + cdata = adata.copy() + ddata = adata.copy() + edata = adata.copy() + preprocessor = Preprocessor() + + starttime = timeit.default_timer() + preprocessor.preprocess_adata(edata, recipe="monocle", gene_selection_method="gini") + monocle_gini_result = edata.var.use_for_pca + + preprocessor.preprocess_adata(adata, recipe="monocle", gene_selection_method="cv_dispersion") + monocle_cv_dispersion_result_1 = adata.var.use_for_pca + + preprocessor.preprocess_adata(bdata, recipe="monocle", gene_selection_method="fano_dispersion") + monocle_fano_dispersion_result_2 = bdata.var.use_for_pca + + preprocessor.preprocess_adata(cdata, recipe="seurat", gene_selection_method="fano_dispersion") + seurat_fano_dispersion_result_3 = cdata.var.use_for_pca + + preprocessor.preprocess_adata(ddata, recipe="seurat", gene_selection_method="seurat_dispersion") + seurat_seurat_dispersion_result_4 = ddata.var.use_for_pca + + diff_count = sum(1 for x, y in zip(monocle_cv_dispersion_result_1, monocle_gini_result) if x != y) + print(diff_count / len(monocle_cv_dispersion_result_1) * 100) + + diff_count = sum(1 for x, y in zip(monocle_cv_dispersion_result_1, monocle_fano_dispersion_result_2) if x != y) + print(diff_count / len(monocle_cv_dispersion_result_1) * 100) + + diff_count = sum(1 for x, y in zip(monocle_fano_dispersion_result_2, seurat_fano_dispersion_result_3) if x != y) + print(diff_count / len(monocle_fano_dispersion_result_2) * 100) + + diff_count = sum(1 for x, y in zip(seurat_fano_dispersion_result_3, seurat_seurat_dispersion_result_4) if x != y) + print(diff_count / len(seurat_fano_dispersion_result_3) * 100) + + diff_count = sum(1 for x, y in zip(monocle_cv_dispersion_result_1, seurat_seurat_dispersion_result_4) if x != y) + print(diff_count / len(monocle_cv_dispersion_result_1) * 100) + + print("The preprocess_adata() time difference is :", timeit.default_timer() - starttime) + + +def test_regress_out(): + adata = dyn.sample_data.hematopoiesis_raw() + dyn.pl.basic_stats(adata) + dyn.pl.highest_frac_genes(adata) + + preprocessor = Preprocessor() + + starttime = timeit.default_timer() + preprocessor.preprocess_adata(adata, recipe="monocle", regress_out=["nCounts", "Dummy", "Test", "pMito"]) + print("The preprocess_adata() time difference is :", timeit.default_timer() - starttime) + + if __name__ == "__main__": # test_is_nonnegative() @@ -228,4 +294,5 @@ def test_is_nonnegative(): # test_highest_frac_genes_plot(adata.copy()) # test_highest_frac_genes_plot_prefix_list(adata.copy()) # test_recipe_monocle_feature_selection_layer_simple0() + # test_gene_selection_method() pass From 7f837a2d44233b195126c235589deecd9df7cf0e Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Thu, 6 Apr 2023 16:43:33 -0400 Subject: [PATCH 08/28] docstring --- dynamo/preprocessing/gene_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 67b712cb1..b5f963612 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -477,7 +477,7 @@ def select_genes_by_svr( filter_bool: A boolean array from the user to select genes for downstream analysis. Defaults to None. layers: The layer(s) to be used for calculating dispersion score via support vector regression (SVR). Defaults to "X". - algorithm: Select a method to calculate the dispersion of genes, whether "cv_dispersion" or "fano_dispersion" + algorithm: Method of calculating mean and coefficient of variation, either "cv_dispersion" or "fano_dispersion" sort_inverse: whether to sort genes from less noisy to more noisy (to use for size estimation not for feature selection). Defaults to False. use_all_genes_cells: A logic flag to determine whether all cells and genes should be used for the size factor From dfb5866aeaf16effec30e3c578da893c7de9c762 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Thu, 6 Apr 2023 16:43:33 -0400 Subject: [PATCH 09/28] docstring --- dynamo/preprocessing/gene_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 67b712cb1..b5f963612 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -477,7 +477,7 @@ def select_genes_by_svr( filter_bool: A boolean array from the user to select genes for downstream analysis. Defaults to None. layers: The layer(s) to be used for calculating dispersion score via support vector regression (SVR). Defaults to "X". - algorithm: Select a method to calculate the dispersion of genes, whether "cv_dispersion" or "fano_dispersion" + algorithm: Method of calculating mean and coefficient of variation, either "cv_dispersion" or "fano_dispersion" sort_inverse: whether to sort genes from less noisy to more noisy (to use for size estimation not for feature selection). Defaults to False. use_all_genes_cells: A logic flag to determine whether all cells and genes should be used for the size factor From 0e0842427b77dbbbeac546e43ef31b7dd4002a98 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Fri, 7 Apr 2023 20:09:07 -0400 Subject: [PATCH 10/28] add an warning msg for the dispersion mode --- dynamo/preprocessing/gene_selection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index b5f963612..ab1dc1316 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -347,6 +347,7 @@ def top_table(adata: AnnData, layer: str = "X", mode: Literal["dispersion", "gin if mode == "dispersion": if adata.uns[key] is None: + main_warning("dispersion mode is deprecated. This mode will be removed in the future.") estimate_dispersion(adata, layers=[layer]) if adata.uns[key] is None: From 89089c6a4fb84828958b3d1b5fe08d22c3205de5 Mon Sep 17 00:00:00 2001 From: LoveLennone <117324201+LoveLennone@users.noreply.github.com> Date: Sat, 8 Apr 2023 22:07:21 -0400 Subject: [PATCH 11/28] deprecated functions --- dynamo/preprocessing/__init__.py | 2 - dynamo/preprocessing/_deprecated.py | 239 +++++++++++++++++++++++++ dynamo/preprocessing/gene_selection.py | 238 +----------------------- tests/test_preprocess.py | 2 +- 4 files changed, 246 insertions(+), 235 deletions(-) create mode 100644 dynamo/preprocessing/_deprecated.py diff --git a/dynamo/preprocessing/__init__.py b/dynamo/preprocessing/__init__.py index 3d7d1e80c..4b28794f0 100755 --- a/dynamo/preprocessing/__init__.py +++ b/dynamo/preprocessing/__init__.py @@ -37,7 +37,6 @@ from .CnmfPreprocessor import CnmfPreprocessor from .gene_selection import ( Gini, - estimate_dispersion, select_genes_by_svr, select_genes_monocle, top_table, @@ -56,7 +55,6 @@ "recipe_velocyto", "Gini", "top_table", - "estimate_dispersion", "filter_cells_by_outliers", "select_genes_monocle", "filter_genes", diff --git a/dynamo/preprocessing/_deprecated.py b/dynamo/preprocessing/_deprecated.py new file mode 100644 index 000000000..3f5e4c801 --- /dev/null +++ b/dynamo/preprocessing/_deprecated.py @@ -0,0 +1,239 @@ +from typing import Dict, List, Optional, Tuple, Union +from anndata import AnnData +import numpy as np +import pandas as pd +import statsmodels.api as sm +import re +from ..configuration import DKM +from ..dynamo_logger import ( + LoggerManager, + main_debug, + main_info, + main_warning, +) +from scipy.sparse import csr_matrix, issparse +from .utils import cook_dist + + +def _disp_calc_helper_NB( + adata: AnnData, layers: str = "X", min_cells_detected: int = 1 +) -> Tuple[List[str], List[pd.DataFrame]]: + """Helper function to calculate the dispersion parameter. + + This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). + + Args: + adata: an Anndata object. + layers: the layer of data used for dispersion fitting. Defaults to "X". + min_cells_detected: the minimal required number of cells with expression for selecting gene for dispersion + fitting. Defaults to 1. + + Returns: + layers: a list of layers available. + res_list: a list of pd.DataFrames with mu, dispersion for each gene that passes filters. + """ + main_warning(__name__ + " is deprecated.") + layers = DKM.get_available_layer_keys(adata, layers=layers, include_protein=False) + + res_list = [] + for layer in layers: + if layer == "raw": + CM = adata.raw.X + szfactors = adata.obs[layer + "Size_Factor"][:, None] + elif layer == "X": + CM = adata.X + szfactors = adata.obs["Size_Factor"][:, None] + else: + CM = adata.layers[layer] + szfactors = adata.obs[layer + "Size_Factor"][:, None] + + if issparse(CM): + CM.data = np.round(CM.data, 0) + rounded = CM + else: + rounded = CM.round().astype("int") + + lowerDetectedLimit = adata.uns["lowerDetectedLimit"] if "lowerDetectedLimit" in adata.uns.keys() else 1 + nzGenes = (rounded > lowerDetectedLimit).sum(axis=0) + nzGenes = nzGenes > min_cells_detected + + nzGenes = nzGenes.A1 if issparse(rounded) else nzGenes + if layer.startswith("X_"): + x = rounded[:, nzGenes] + else: + x = ( + rounded[:, nzGenes].multiply(csr_matrix(1 / szfactors)) + if issparse(rounded) + else rounded[:, nzGenes] / szfactors + ) + + xim = np.mean(1 / szfactors) if szfactors is not None else 1 + + f_expression_mean = x.mean(axis=0) + + # For NB: Var(Y) = mu * (1 + mu / k) + # x.A.var(axis=0, ddof=1) + f_expression_var = ( + (x.multiply(x).mean(0).A1 - f_expression_mean.A1**2) * x.shape[0] / (x.shape[0] - 1) + if issparse(x) + else x.var(axis=0, ddof=0) ** 2 + ) # np.mean(np.power(x - f_expression_mean, 2), axis=0) # variance with n - 1 + # https://scialert.net/fulltext/?doi=ajms.2010.1.15 method of moments + disp_guess_meth_moments = f_expression_var - xim * f_expression_mean # variance - mu + + disp_guess_meth_moments = disp_guess_meth_moments / np.power( + f_expression_mean, 2 + ) # this is dispersion parameter (1/k) + + res = pd.DataFrame( + { + "mu": np.array(f_expression_mean).flatten(), + "disp": np.array(disp_guess_meth_moments).flatten(), + } + ) + res.loc[res["mu"] == 0, "mu"] = None + res.loc[res["mu"] == 0, "disp"] = None + res.loc[res["disp"] < 0, "disp"] = 0 + + res["gene_id"] = adata.var_names[nzGenes] + + res_list.append(res) + + return layers, res_list + + +def _parametric_dispersion_fit( + disp_table: pd.DataFrame, initial_coefs: np.ndarray = np.array([1e-6, 1]) +) -> Tuple[sm.formula.glm, np.ndarray, pd.DataFrame]: + """Perform the dispersion parameter fitting with initial guesses of coefficients. + + This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). + + Args: + disp_table: A pandas dataframe with mu, dispersion for each gene that passes filters. + initial_coefs: Initial parameters for the gamma fit of the dispersion parameters. Defaults to + np.array([1e-6, 1]). + + Returns: + A tuple (fit, coefs, good), where fit is a statsmodels fitting object, coefs contains the two resulting gamma + fitting coefficient, and good is the subsetted dispersion table that is subjected to Gamma fitting. + """ + main_warning(__name__ + " is deprecated.") + coefs = initial_coefs + iter = 0 + while True: + residuals = disp_table["disp"] / (coefs[0] + coefs[1] / disp_table["mu"]) + good = disp_table.loc[(residuals > initial_coefs[0]) & (residuals < 10000), :] + # https://stats.stackexchange.com/questions/356053/the-identity-link-function-does-not-respect-the-domain-of-the + # -gamma-family + fit = sm.formula.glm( + "disp ~ I(1 / mu)", + data=good, + family=sm.families.Gamma(link=sm.genmod.families.links.identity), + ).train(start_params=coefs) + + oldcoefs = coefs + coefs = fit.params + + if coefs[0] < initial_coefs[0]: + coefs[0] = initial_coefs[0] + if coefs[1] < 0: + main_warning("Parametric dispersion fit may be failed.") + + if np.sum(np.log(coefs / oldcoefs) ** 2 < coefs[0]): + break + iter += 1 + + if iter > 10: + main_warning("Dispersion fit didn't converge") + break + if not np.all(coefs > 0): + main_warning("Parametric dispersion fit may be failed.") + + return fit, coefs, good + + +def _estimate_dispersion( + adata: AnnData, + layers: str = "X", + modelFormulaStr: str = "~ 1", + min_cells_detected: int = 1, + removeOutliers: bool = False, +) -> AnnData: + """This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). + + Args: + adata: an AnnData object. + layers: the layer(s) to be used for calculating dispersion. Default is "X" if there is no spliced layers. + modelFormulaStr: the model formula used to calculate dispersion parameters. Not used. Defaults to "~ 1". + min_cells_detected: the minimum number of cells detected for calculating the dispersion. Defaults to 1. + removeOutliers: whether to remove outliers when performing dispersion fitting. Defaults to False. + + Raises: + Exception: there is no valid DataFrames with mu for genes. + + Returns: + An updated annData object with dispFitInfo added to uns attribute as a new key. + """ + main_warning(__name__ + " is deprecated.") + logger = LoggerManager.gen_logger("dynamo-preprocessing") + # mu = None + model_terms = [x.strip() for x in re.compile("~|\\*|\\+").split(modelFormulaStr)] + model_terms = list(set(model_terms) - set([""])) + + cds_pdata = adata.obs # .loc[:, model_terms] + cds_pdata["rowname"] = cds_pdata.index.values + layers, disp_tables = _disp_calc_helper_NB(adata[:, :], layers, min_cells_detected) + # disp_table['disp'] = np.random.uniform(0, 10, 11) + # disp_table = cds_pdata.apply(disp_calc_helper_NB(adata[:, :], min_cells_detected)) + + # cds_pdata <- dplyr::group_by_(dplyr::select_(rownames_to_column(pData(cds)), "rowname", .dots=model_terms), .dots + # =model_terms) + # disp_table <- as.data.frame(cds_pdata %>% do(disp_calc_helper_NB(cds[,.$rowname], cds@expressionFamily, min_cells_ + # detected))) + for ind in range(len(layers)): + layer, disp_table = layers[ind], disp_tables[ind] + + if disp_table is None: + raise Exception("Parametric dispersion fitting failed, please set a different lowerDetectionLimit") + + disp_table = disp_table.loc[np.where(disp_table["mu"] != np.nan)[0], :] + + res = _parametric_dispersion_fit(disp_table) + fit, coefs, good = res[0], res[1], res[2] + + if removeOutliers: + # influence = fit.get_influence().cooks_distance() + # #CD is the distance and p is p-value + # (CD, p) = influence.cooks_distance + + CD = cook_dist(fit, 1 / good["mu"][:, None], good) + cooksCutoff = 4 / good.shape[0] + main_info("Removing " + str(len(CD[CD > cooksCutoff])) + " outliers") + outliers = CD > cooksCutoff + # use CD.index.values? remove genes that lost when doing parameter fitting + lost_gene = set(good.index.values).difference(set(range(len(CD)))) + outliers[lost_gene] = True + res = _parametric_dispersion_fit(good.loc[~outliers, :]) + + fit, coefs = res[0], res[1] + + def ans(q): + return coefs[0] + coefs[1] / q + + if layer == "X": + logger.info_insert_adata("dispFitInfo", "uns") + adata.uns["dispFitInfo"] = { + "disp_table": good, + "disp_func": ans, + "coefs": coefs, + } + else: + logger.info_insert_adata(layer + "_dispFitInfo", "uns") + adata.uns[layer + "_dispFitInfo"] = { + "disp_table": good, + "disp_func": ans, + "coefs": coefs, + } + + return adata \ No newline at end of file diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index ab1dc1316..9c6e51171 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -1,6 +1,5 @@ -import re import warnings -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from numpy import ndarray @@ -12,7 +11,6 @@ import numpy as np import pandas as pd import scipy.sparse -import statsmodels.api as sm from anndata import AnnData from scipy.sparse import csr_matrix, issparse @@ -30,7 +28,8 @@ get_svr_filter, seurat_get_mean_var, ) -from .utils import compute_gene_exp_fraction, cook_dist, merge_adata_attrs +from .utils import compute_gene_exp_fraction, merge_adata_attrs +from ._deprecated import _estimate_dispersion def Gini(adata: AnnData, layers: Union[Literal["all"], List[str]] = "all") -> AnnData: @@ -94,229 +93,6 @@ def compute_gini(CM): return adata -def parametric_dispersion_fit( - disp_table: pd.DataFrame, initial_coefs: np.ndarray = np.array([1e-6, 1]) -) -> Tuple[sm.formula.glm, np.ndarray, pd.DataFrame]: - """Perform the dispersion parameter fitting with initial guesses of coefficients. - - This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). - - Args: - disp_table: A pandas dataframe with mu, dispersion for each gene that passes filters. - initial_coefs: Initial parameters for the gamma fit of the dispersion parameters. Defaults to - np.array([1e-6, 1]). - - Returns: - A tuple (fit, coefs, good), where fit is a statsmodels fitting object, coefs contains the two resulting gamma - fitting coefficient, and good is the subsetted dispersion table that is subjected to Gamma fitting. - """ - - coefs = initial_coefs - iter = 0 - while True: - residuals = disp_table["disp"] / (coefs[0] + coefs[1] / disp_table["mu"]) - good = disp_table.loc[(residuals > initial_coefs[0]) & (residuals < 10000), :] - # https://stats.stackexchange.com/questions/356053/the-identity-link-function-does-not-respect-the-domain-of-the - # -gamma-family - fit = sm.formula.glm( - "disp ~ I(1 / mu)", - data=good, - family=sm.families.Gamma(link=sm.genmod.families.links.identity), - ).train(start_params=coefs) - - oldcoefs = coefs - coefs = fit.params - - if coefs[0] < initial_coefs[0]: - coefs[0] = initial_coefs[0] - if coefs[1] < 0: - main_warning("Parametric dispersion fit may be failed.") - - if np.sum(np.log(coefs / oldcoefs) ** 2 < coefs[0]): - break - iter += 1 - - if iter > 10: - main_warning("Dispersion fit didn't converge") - break - if not np.all(coefs > 0): - main_warning("Parametric dispersion fit may be failed.") - - return fit, coefs, good - - -def disp_calc_helper_NB( - adata: AnnData, layers: str = "X", min_cells_detected: int = 1 -) -> Tuple[List[str], List[pd.DataFrame]]: - """Helper function to calculate the dispersion parameter. - - This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). - - Args: - adata: an Anndata object. - layers: the layer of data used for dispersion fitting. Defaults to "X". - min_cells_detected: the minimal required number of cells with expression for selecting gene for dispersion - fitting. Defaults to 1. - - Returns: - layers: a list of layers available. - res_list: a list of pd.DataFrames with mu, dispersion for each gene that passes filters. - """ - layers = DKM.get_available_layer_keys(adata, layers=layers, include_protein=False) - - res_list = [] - for layer in layers: - if layer == "raw": - CM = adata.raw.X - szfactors = adata.obs[layer + "Size_Factor"][:, None] - elif layer == "X": - CM = adata.X - szfactors = adata.obs["Size_Factor"][:, None] - else: - CM = adata.layers[layer] - szfactors = adata.obs[layer + "Size_Factor"][:, None] - - if issparse(CM): - CM.data = np.round(CM.data, 0) - rounded = CM - else: - rounded = CM.round().astype("int") - - lowerDetectedLimit = adata.uns["lowerDetectedLimit"] if "lowerDetectedLimit" in adata.uns.keys() else 1 - nzGenes = (rounded > lowerDetectedLimit).sum(axis=0) - nzGenes = nzGenes > min_cells_detected - - nzGenes = nzGenes.A1 if issparse(rounded) else nzGenes - if layer.startswith("X_"): - x = rounded[:, nzGenes] - else: - x = ( - rounded[:, nzGenes].multiply(csr_matrix(1 / szfactors)) - if issparse(rounded) - else rounded[:, nzGenes] / szfactors - ) - - xim = np.mean(1 / szfactors) if szfactors is not None else 1 - - f_expression_mean = x.mean(axis=0) - - # For NB: Var(Y) = mu * (1 + mu / k) - # x.A.var(axis=0, ddof=1) - f_expression_var = ( - (x.multiply(x).mean(0).A1 - f_expression_mean.A1**2) * x.shape[0] / (x.shape[0] - 1) - if issparse(x) - else x.var(axis=0, ddof=0) ** 2 - ) # np.mean(np.power(x - f_expression_mean, 2), axis=0) # variance with n - 1 - # https://scialert.net/fulltext/?doi=ajms.2010.1.15 method of moments - disp_guess_meth_moments = f_expression_var - xim * f_expression_mean # variance - mu - - disp_guess_meth_moments = disp_guess_meth_moments / np.power( - f_expression_mean, 2 - ) # this is dispersion parameter (1/k) - - res = pd.DataFrame( - { - "mu": np.array(f_expression_mean).flatten(), - "disp": np.array(disp_guess_meth_moments).flatten(), - } - ) - res.loc[res["mu"] == 0, "mu"] = None - res.loc[res["mu"] == 0, "disp"] = None - res.loc[res["disp"] < 0, "disp"] = 0 - - res["gene_id"] = adata.var_names[nzGenes] - - res_list.append(res) - - return layers, res_list - - -def estimate_dispersion( - adata: AnnData, - layers: str = "X", - modelFormulaStr: str = "~ 1", - min_cells_detected: int = 1, - removeOutliers: bool = False, -) -> AnnData: - """This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). - - Args: - adata: an AnnData object. - layers: the layer(s) to be used for calculating dispersion. Default is "X" if there is no spliced layers. - modelFormulaStr: the model formula used to calculate dispersion parameters. Not used. Defaults to "~ 1". - min_cells_detected: the minimum number of cells detected for calculating the dispersion. Defaults to 1. - removeOutliers: whether to remove outliers when performing dispersion fitting. Defaults to False. - - Raises: - Exception: there is no valid DataFrames with mu for genes. - - Returns: - An updated annData object with dispFitInfo added to uns attribute as a new key. - """ - - logger = LoggerManager.gen_logger("dynamo-preprocessing") - # mu = None - model_terms = [x.strip() for x in re.compile("~|\\*|\\+").split(modelFormulaStr)] - model_terms = list(set(model_terms) - set([""])) - - cds_pdata = adata.obs # .loc[:, model_terms] - cds_pdata["rowname"] = cds_pdata.index.values - layers, disp_tables = disp_calc_helper_NB(adata[:, :], layers, min_cells_detected) - # disp_table['disp'] = np.random.uniform(0, 10, 11) - # disp_table = cds_pdata.apply(disp_calc_helper_NB(adata[:, :], min_cells_detected)) - - # cds_pdata <- dplyr::group_by_(dplyr::select_(rownames_to_column(pData(cds)), "rowname", .dots=model_terms), .dots - # =model_terms) - # disp_table <- as.data.frame(cds_pdata %>% do(disp_calc_helper_NB(cds[,.$rowname], cds@expressionFamily, min_cells_ - # detected))) - for ind in range(len(layers)): - layer, disp_table = layers[ind], disp_tables[ind] - - if disp_table is None: - raise Exception("Parametric dispersion fitting failed, please set a different lowerDetectionLimit") - - disp_table = disp_table.loc[np.where(disp_table["mu"] != np.nan)[0], :] - - res = parametric_dispersion_fit(disp_table) - fit, coefs, good = res[0], res[1], res[2] - - if removeOutliers: - # influence = fit.get_influence().cooks_distance() - # #CD is the distance and p is p-value - # (CD, p) = influence.cooks_distance - - CD = cook_dist(fit, 1 / good["mu"][:, None], good) - cooksCutoff = 4 / good.shape[0] - main_info("Removing " + str(len(CD[CD > cooksCutoff])) + " outliers") - outliers = CD > cooksCutoff - # use CD.index.values? remove genes that lost when doing parameter fitting - lost_gene = set(good.index.values).difference(set(range(len(CD)))) - outliers[lost_gene] = True - res = parametric_dispersion_fit(good.loc[~outliers, :]) - - fit, coefs = res[0], res[1] - - def ans(q): - return coefs[0] + coefs[1] / q - - if layer == "X": - logger.info_insert_adata("dispFitInfo", "uns") - adata.uns["dispFitInfo"] = { - "disp_table": good, - "disp_func": ans, - "coefs": coefs, - } - else: - logger.info_insert_adata(layer + "_dispFitInfo", "uns") - adata.uns[layer + "_dispFitInfo"] = { - "disp_table": good, - "disp_func": ans, - "coefs": coefs, - } - - return adata - - def top_table(adata: AnnData, layer: str = "X", mode: Literal["dispersion", "gini"] = "dispersion") -> pd.DataFrame: """Retrieve a table that contains gene names and other info whose dispersions/gini index are highest. @@ -348,9 +124,7 @@ def top_table(adata: AnnData, layer: str = "X", mode: Literal["dispersion", "gin if mode == "dispersion": if adata.uns[key] is None: main_warning("dispersion mode is deprecated. This mode will be removed in the future.") - estimate_dispersion(adata, layers=[layer]) - - if adata.uns[key] is None: + _estimate_dispersion(adata, layers=[layer]) raise KeyError( "Error: for adata.uns.key=%s, no dispersion model found. Please call estimate_dispersion() before calling this function" % key @@ -393,8 +167,8 @@ def select_genes_monocle( keep_filtered: Whether to keep genes that don't pass the filtering in the adata object. Defaults to True. n_top_genes: the number of top genes based on scoring method (specified by sort_by) will be selected as feature genes. Defaults to 2000. - sort_by: the sorting methods, either SVR, dispersion or Gini index, to be used to select genes. Defaults to - "SVR". TODO: Should be fixed! + sort_by: the sorting methods to be used to select genes. Should be one of the gini index or + dispersion of coefficient variation or fano. Defaults to cv_dispersion. exprs_frac_for_gene_exclusion: threshold of fractions for high fraction genes. Defaults to 1. genes_to_exclude: genes that are excluded from evaluation. Defaults to None. SVRs_kwargs: kwargs for `SVRs`. Defaults to {}. diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index 8d4a5a7b5..25ff78a58 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -313,5 +313,5 @@ def test_regress_out(): # test_highest_frac_genes_plot(adata.copy()) # test_highest_frac_genes_plot_prefix_list(adata.copy()) # test_recipe_monocle_feature_selection_layer_simple0() - # test_gene_selection_method() + test_gene_selection_method() pass From 7e659cdaab84003dfbff14b37b7188820e6e4e0c Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Wed, 12 Apr 2023 18:35:44 -0400 Subject: [PATCH 12/28] fix multiprocess in sctransform --- dynamo/external/sctransform.py | 18 +++++++++--------- dynamo/preprocessing/Preprocessor.py | 16 +++++++--------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/dynamo/external/sctransform.py b/dynamo/external/sctransform.py index 9879fea2d..6c7278c86 100644 --- a/dynamo/external/sctransform.py +++ b/dynamo/external/sctransform.py @@ -8,7 +8,6 @@ # ================================================================= import os -from multiprocessing import Manager, Pool import numpy as np import pandas as pd @@ -128,6 +127,8 @@ def sctransform_core( """ A re-implementation of SCTransform from the Satija lab. """ + import multiprocessing + main_info("sctransform adata on layer: %s" % (layer)) X = DKM.select_layer_data(adata, layer).copy() X = sp.sparse.csr_matrix(X) @@ -139,10 +140,8 @@ def sctransform_core( genes_ix = genes.copy() X = X[:, genes] - Xraw = X.copy() gene_names = gene_names[genes] genes = np.arange(X.shape[1]) - genes_cell_count = X.sum(0).A.flatten() genes_log_gmean = np.log10(gmean(X, axis=0, eps=gmean_eps)) @@ -188,7 +187,10 @@ def sctransform_core( bin_ind = np.ceil(np.arange(1, genes_step1.size + 1) / bin_size) max_bin = max(bin_ind) - ps = Manager().dict() + ps = multiprocessing.Manager().dict() + + # create a process context of fork that copy a Python process from an existing process. + ctx = multiprocessing.get_context("fork") for i in range(1, int(max_bin) + 1): genes_bin_regress = genes_step1[bin_ind == i] @@ -197,7 +199,9 @@ def sctransform_core( mm = np.vstack((np.ones(data_step1.shape[0]), data_step1["log_umi"].values.flatten())).T pc_chunksize = umi_bin.shape[1] // os.cpu_count() + 1 - pool = Pool(os.cpu_count(), _parallel_init, [genes_bin_regress, umi_bin, gene_names, mm, ps]) + + pool = ctx.Pool(os.cpu_count(), _parallel_init, [genes_bin_regress, umi_bin, gene_names, mm, ps]) + try: pool.map(_parallel_wrapper, range(umi_bin.shape[1]), chunksize=pc_chunksize) finally: @@ -254,10 +258,6 @@ def sctransform_core( full_model_pars["theta"] = theta del full_model_pars["dispersion"] - model_pars_outliers = outliers - - regressor_data = np.vstack((np.ones(cell_attrs.shape[0]), cell_attrs["log_umi"].values)).T - d = X.data x, y = X.nonzero() mud = np.exp(full_model_pars.values[:, 0][y] + full_model_pars.values[:, 1][y] * cell_attrs["log_umi"].values[x]) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 36be701f8..80bc07a73 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -413,9 +413,7 @@ def preprocess_adata_seurat_wo_pca( temp_logger.finish_progress(progress_name="preprocess by seurat wo pca recipe") - def config_monocle_recipe( - self, adata: AnnData, n_top_genes: int = 2000, gene_selection_method: str = "cv_dispersion" - ) -> None: + def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None: """Automatically configure the preprocessor for monocle recipe. Args: @@ -460,7 +458,7 @@ def config_monocle_recipe( self.select_genes = select_genes_monocle self.select_genes_kwargs = { "n_top_genes": n_top_genes, - "sort_by": "cv_dispersion" if gene_selection_method is None else gene_selection_method, + "sort_by": "cv_dispersion", "keep_filtered": True, "SVRs_kwargs": { "relative_expr": True, @@ -519,7 +517,7 @@ def preprocess_adata_monocle( temp_logger.finish_progress(progress_name="preprocess") - def config_seurat_recipe(self, adata: AnnData, gene_selection_method: str = "seurat_dispersion") -> None: + def config_seurat_recipe(self, adata: AnnData) -> None: """Automatically configure the preprocessor for using the seurat style recipe. Args: @@ -529,7 +527,7 @@ def config_seurat_recipe(self, adata: AnnData, gene_selection_method: str = "seu self.config_monocle_recipe(adata) self.select_genes = select_genes_by_seurat_recipe self.select_genes_kwargs = { - "algorithm": "seurat_dispersion" if gene_selection_method is None else gene_selection_method, + "algorithm": "seurat_dispersion", "n_top_genes": 2000, } self.normalize_by_cells_function_kwargs = {"skip_log": True} @@ -586,6 +584,7 @@ def config_sctransform_recipe(self, adata: AnnData) -> None: "min_cell_u": 5, "min_count_u": 1, } + self.select_genes = select_genes_by_seurat_recipe self.select_genes_kwargs = {"inplace": True} self.sctransform_kwargs = {"layers": raw_layers, "n_top_genes": 2000} self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50} @@ -740,7 +739,6 @@ def preprocess_adata( recipe: Literal[ "monocle", "seurat", "sctransform", "pearson_residuals", "monocle_pearson_residuals" ] = "monocle", - gene_selection_method: Optional[str] = None, tkey: Optional[str] = None, ) -> None: """Preprocess the AnnData object with the recipe specified. @@ -757,10 +755,10 @@ def preprocess_adata( """ if recipe == "monocle": - self.config_monocle_recipe(adata, gene_selection_method=gene_selection_method) + self.config_monocle_recipe(adata) self.preprocess_adata_monocle(adata, tkey=tkey) elif recipe == "seurat": - self.config_seurat_recipe(adata, gene_selection_method=gene_selection_method) + self.config_seurat_recipe(adata) self.preprocess_adata_seurat(adata, tkey=tkey) elif recipe == "sctransform": self.config_sctransform_recipe(adata) From 7a5c6e252c124631c3d205bf745d4c0d10d3ba97 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Thu, 13 Apr 2023 12:21:27 -0400 Subject: [PATCH 13/28] delete duplicated filtering cells operation --- dynamo/preprocessing/Preprocessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 80bc07a73..c8c665141 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -501,7 +501,6 @@ def preprocess_adata_monocle( self._filter_cells_by_outliers(adata) self._filter_genes_by_outliers(adata) - self._filter_cells_by_outliers(adata) self._select_genes(adata) # gene selection has been completed above. Now we need to append/delete/force selected gene list required by users. From 9272bc87b132629efff9d335728537fc75532da0 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Fri, 14 Apr 2023 16:15:19 -0400 Subject: [PATCH 14/28] Add checking of tkey and cell_cycle_score --- dynamo/plot/preprocess.py | 8 ++--- dynamo/preprocessing/Preprocessor.py | 50 ++++++++++++++++++++++---- dynamo/preprocessing/gene_selection.py | 4 ++- 3 files changed, 51 insertions(+), 11 deletions(-) diff --git a/dynamo/plot/preprocess.py b/dynamo/plot/preprocess.py index ca29fa358..ba0474745 100755 --- a/dynamo/plot/preprocess.py +++ b/dynamo/plot/preprocess.py @@ -677,11 +677,11 @@ def feature_genes( import matplotlib.pyplot as plt mode = adata.uns["feature_selection"] if mode is None else mode - layer = DynamoAdataKeyManager.get_available_layer_keys(adata, layer, include_protein=False)[0] - uns_store_key = None - if mode == "dispersion": + + if mode == "dispersion": # TODO: Deprecated. + main_warning("dispersion is deprecated for soon-to-be removed features.") uns_store_key = "dispFitInfo" if layer in ["raw", "X"] else layer + "_dispFitInfo" table = top_table(adata, layer) @@ -689,7 +689,7 @@ def feature_genes( np.nanmin(table["mean_expression"]), np.nanmax(table["mean_expression"]), ) - elif mode == "SVR": + elif "_dispersion" in mode: # "cv_dispersion", "fano_dispersion" prefix = "" if layer == "X" else layer + "_" uns_store_key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR" diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index c8c665141..959ab723c 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -12,6 +12,7 @@ from ..configuration import DKM from ..dynamo_logger import ( LoggerManager, + main_debug, main_info, main_info_insert_adata, main_warning, @@ -21,6 +22,7 @@ sctransform, select_genes_by_pearson_residuals, ) +from .cell_cycle import cell_cycle_scores from .gene_selection import select_genes_by_seurat_recipe, select_genes_monocle from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca from .preprocessor_utils import _infer_labeling_experiment_type @@ -66,7 +68,8 @@ def __init__( gene_append_list: List[str] = [], gene_exclude_list: List[str] = [], force_gene_list: Optional[List[str]] = None, - sctransform_kwargs={}, + sctransform_kwargs: dict = {}, + cell_cycle_score_kwargs: dict = {}, ) -> None: """Preprocessor constructor. @@ -119,6 +122,7 @@ def __init__( self.pca = pca_function self.pca_kwargs = pca_kwargs + self.cell_cycle_score = cell_cycle_scores # self.n_top_genes = n_top_genes self.convert_gene_name = convert_gene_name_function @@ -134,6 +138,7 @@ def __init__( self.select_genes_kwargs = select_genes_kwargs self.sctransform_kwargs = sctransform_kwargs self.normalize_selected_genes_kwargs = normalize_selected_genes_kwargs + self.cell_cycle_score_kwargs = cell_cycle_score_kwargs def add_experiment_info( self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None @@ -164,14 +169,21 @@ def add_experiment_info( ) = detect_experiment_datatype(adata) # check whether tkey info exists if has_labeling if has_labeling: - main_info("data contains labeling info, checking tkey:" + str(tkey)) - if tkey not in adata.obs.keys(): - raise ValueError("tkey:%s encoding the labeling time is not existed in your adata." % (str(tkey))) + main_debug("data contains labeling info, checking tkey:" + str(tkey)) if tkey is not None and adata.obs[tkey].max() > 60: main_warning( "Looks like you are using minutes as the time unit. For the purpose of numeric stability, " "we recommend using hour as the time unit." ) + if tkey not in adata.obs.keys(): + if (tkey is None) and (DKM.UNS_PP_TKEY in adata.obs.keys()): + tkey = DKM.UNS_PP_TKEY + main_warning( + "No 'tkey' value was given despite 'tkey' information in the adata, " + "so we will use 'time' in the adata as the default." + ) + else: + raise ValueError("tkey:%s encoding the labeling time is not existed in your adata." % (str(tkey))) adata.uns["pp"]["tkey"] = tkey adata.uns["pp"]["has_splicing"] = has_splicing @@ -382,8 +394,7 @@ def _log1p(self, adata: AnnData) -> None: self.log1p(adata, **self.log1p_kwargs) def _pca(self, adata: AnnData) -> None: - """Perform pca reduction with args specified in the preprocessor's - `pca_kwargs`. + """Perform pca reduction with args specified in the preprocessor's `pca_kwargs`. Args: adata: an AnnData object. @@ -393,6 +404,24 @@ def _pca(self, adata: AnnData) -> None: main_info("reducing dimension by PCA...") self.pca(adata, **self.pca_kwargs) + def _cell_cycle_score(self, adata: AnnData) -> None: + """Estimate cell cycle stage of each cell based on its gene expression pattern. + + Args: + adata: an AnnData object. + """ + + if self.cell_cycle_score: + main_debug("cell cycle scoring...") + try: + self.cell_cycle_score(adata, **self.cell_cycle_score_kwargs) + except Exception: + main_warning( + "\nDynamo is not able to perform cell cycle staging for you automatically. \n" + "Since dyn.pl.phase_diagram in dynamo by default colors cells by its cell-cycle stage, \n" + "you need to set color argument accordingly if confronting errors related to this." + ) + def preprocess_adata_seurat_wo_pca( self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None ) -> None: @@ -480,6 +509,14 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None self.pca = pca self.pca_kwargs = {"pca_key": "X_pca"} + self.cell_cycle_score_kwargs = { + "layer": None, + "gene_list": None, + "refine": True, + "threshold": 0.3, + "copy": False, + } + def preprocess_adata_monocle( self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None ) -> None: @@ -513,6 +550,7 @@ def preprocess_adata_monocle( self._log1p(adata) self._pca(adata) + self._cell_cycle_score(adata) temp_logger.finish_progress(progress_name="preprocess") diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 9c6e51171..b9c3a64f7 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -22,6 +22,7 @@ main_info_insert_adata_var, main_warning, ) +from ._deprecated import _estimate_dispersion from .preprocessor_utils import ( calc_sz_factor, get_nan_or_inf_data_bool_mask, @@ -29,7 +30,6 @@ seurat_get_mean_var, ) from .utils import compute_gene_exp_fraction, merge_adata_attrs -from ._deprecated import _estimate_dispersion def Gini(adata: AnnData, layers: Union[Literal["all"], List[str]] = "all") -> AnnData: @@ -234,6 +234,8 @@ def select_genes_monocle( adata._inplace_subset_var(filter_bool) adata.var["use_for_pca"] = True + adata.uns["feature_selection"] = sort_by + def select_genes_by_svr( adata_ori: AnnData, From e64004b6ecc0bb783ecd2b58cefef7046da056f2 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Fri, 14 Apr 2023 17:27:54 -0400 Subject: [PATCH 15/28] fix mode in dyn.pl.feature_genes --- dynamo/plot/preprocess.py | 26 ++++++++++++++++++-------- dynamo/preprocessing/gene_selection.py | 4 ++-- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/dynamo/plot/preprocess.py b/dynamo/plot/preprocess.py index ba0474745..16f324c41 100755 --- a/dynamo/plot/preprocess.py +++ b/dynamo/plot/preprocess.py @@ -9,7 +9,7 @@ from ..configuration import DynamoAdataKeyManager from ..dynamo_logger import main_warning from ..preprocessing import preprocess as pp -from ..preprocessing.gene_selection import top_table +from ..preprocessing.gene_selection import get_prediction_by_svr, top_table from ..preprocessing.utils import detect_experiment_datatype from ..tools.utils import get_mapper, update_dict from .utils import save_fig @@ -705,15 +705,25 @@ def feature_genes( np.nanmin(table[prefix + "log_m"]), np.nanmax(table[prefix + "log_m"]), ) + else: # TODO: Gini? + raise NotImplementedError(f"The mode{mode} to plot the feature genes not implemented yet") ordering_genes = adata.var["use_for_pca"] if "use_for_pca" in adata.var.columns else None mu_linspace = np.linspace(x_min, x_max, num=1000) - fit = ( - adata.uns[uns_store_key]["disp_func"](mu_linspace) - if mode == "dispersion" - else adata.uns[uns_store_key]["SVR"](mu_linspace.reshape(-1, 1)) - ) + if "_dispersion" in mode: + mean = adata.uns[uns_store_key]["mean"] + cv = adata.uns[uns_store_key]["cv"] + svr_gamma = adata.uns[uns_store_key]["svr_gamma"] + fit = get_prediction_by_svr(mean, cv, svr_gamma) + fit = fit(mu_linspace.reshape(-1, 1)) + else: + raise NotImplementedError(f"The mode{mode} to plot the feature genes not implemented yet") + # fit = ( + # adata.uns[uns_store_key]["disp_func"](mu_linspace) + # if mode == "dispersion" + # else adata.uns[uns_store_key]["SVR"](mu_linspace.reshape(-1, 1)) + # ) plt.figure(figsize=figsize) plt.plot(mu_linspace, fit, alpha=0.4, color="r") @@ -732,7 +742,7 @@ def feature_genes( alpha=1, color="xkcd:red", ) - elif mode == "SVR": + elif "_dispersion" in mode: ax = plt.scatter( valid_disp_table[prefix + "log_m"], valid_disp_table[prefix + "log_cv"], @@ -751,7 +761,7 @@ def feature_genes( alpha=0.5, color="xkcd:grey", ) - elif mode == "SVR": + elif "_dispersion" in mode: ax = plt.scatter( neg_disp_table[prefix + "log_m"], neg_disp_table[prefix + "log_cv"], diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index b9c3a64f7..6c4762e94 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -301,8 +301,8 @@ def select_genes_by_svr( score = -score # Now we can get "SVR" from get_prediction_by_svr - # key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR" - # adata_ori.uns[key] = {"SVR": fitted_fun} + key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR" + adata_ori.uns[key] = {"mean": mean, "cv": cv, "svr_gamma": svr_gamma} prefix = "" if layer == "X" else layer + "_" (adata.var[prefix + "log_m"], adata.var[prefix + "log_cv"], adata.var[prefix + "score"],) = ( From dc4c61775b6efd60172a01b3d66401f4e41c9365 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Fri, 14 Apr 2023 19:45:42 -0400 Subject: [PATCH 16/28] Organize logs --- dynamo/configuration.py | 4 +- dynamo/dynamo_logger.py | 4 +- dynamo/plot/preprocess.py | 2 +- dynamo/preprocessing/Preprocessor.py | 59 +++++++++------------- dynamo/preprocessing/cell_cycle.py | 8 +-- dynamo/preprocessing/preprocess.py | 3 +- dynamo/preprocessing/preprocessor_utils.py | 23 ++++----- dynamo/preprocessing/utils.py | 27 +++++----- 8 files changed, 57 insertions(+), 73 deletions(-) diff --git a/dynamo/configuration.py b/dynamo/configuration.py index 3f6e55eac..3a30406f9 100755 --- a/dynamo/configuration.py +++ b/dynamo/configuration.py @@ -9,7 +9,7 @@ from cycler import cycler from matplotlib import cm, colors, rcParams -from .dynamo_logger import main_info, main_warning +from .dynamo_logger import main_debug, main_info class DynamoAdataKeyManager: @@ -786,5 +786,5 @@ def set_pub_style_mpltex(): # initialize DynamoSaveConfig and DynamoVisConfig mode defaults DynamoAdataConfig.update_data_store_mode("full") -main_info("setting visualization default mode in dynamo. Your customized matplotlib settings might be overritten.") +main_debug("setting visualization default mode in dynamo. Your customized matplotlib settings might be overwritten.") DynamoVisConfig.set_default_mode() diff --git a/dynamo/dynamo_logger.py b/dynamo/dynamo_logger.py index 812d41b91..e110ccb47 100644 --- a/dynamo/dynamo_logger.py +++ b/dynamo/dynamo_logger.py @@ -154,8 +154,8 @@ def error(self, message, indent_level=1, *args, **kwargs): def info_insert_adata(self, key, adata_attr="obsm", indent_level=1, *args, **kwargs): message = " %s to %s in AnnData Object." % (key, adata_attr) - message = format_logging_message(message, logging.INFO, indent_level=indent_level) - return self.logger.error(message, *args, **kwargs) + message = format_logging_message(message, logging.DEBUG, indent_level=indent_level) + return self.logger.debug(message, *args, **kwargs) def info_insert_adata_var(self, key, indent_level=1, *args, **kwargs): return self.info_insert_adata(self, key, adata_attr="var", indent_level=1, *args, **kwargs) diff --git a/dynamo/plot/preprocess.py b/dynamo/plot/preprocess.py index 16f324c41..2ce051d14 100755 --- a/dynamo/plot/preprocess.py +++ b/dynamo/plot/preprocess.py @@ -1090,7 +1090,7 @@ def highest_frac_genes( else: main_warning( - "%s not in adata.var, ignoring the gene annotation key when plotting", + "%s not in adata.var, ignoring the gene annotation key when plotting" % gene_annotation_key, indent_level=2, ) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 959ab723c..993472bba 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -244,18 +244,17 @@ def standardize_adata(self, adata: AnnData, tkey: str, experiment_type: str) -> self.add_experiment_info(adata, tkey, experiment_type) main_info_insert_adata("tkey=%s" % tkey, "uns['pp']", indent_level=2) main_info_insert_adata("experiment_type=%s" % experiment_type, "uns['pp']", indent_level=2) - main_info("making adata observation index unique...") + main_debug("making adata observation index unique...") self.convert_layers2csr(adata) if self.collapse_species_adata: - main_info("applying collapse species adata...") + main_debug("applying collapse species adata...") self.collapse_species_adata(adata) if self.convert_gene_name: - main_info("applying convert_gene_name function...") self.convert_gene_name(adata) - main_info("making adata observation index unique after gene name conversion...") + main_debug("making adata observation index unique after gene name conversion...") self.unique_var_obs_adata(adata) def _filter_cells_by_outliers(self, adata: AnnData) -> None: @@ -267,8 +266,8 @@ def _filter_cells_by_outliers(self, adata: AnnData) -> None: """ if self.filter_cells_by_outliers: - main_info("filtering outlier cells...") - main_info("cell filter kwargs:" + str(self.filter_cells_by_outliers_kwargs)) + main_debug("filtering outlier cells...") + main_debug("cell filter kwargs:" + str(self.filter_cells_by_outliers_kwargs)) self.filter_cells_by_outliers(adata, **self.filter_cells_by_outliers_kwargs) def _filter_genes_by_outliers(self, adata: AnnData) -> None: @@ -280,8 +279,8 @@ def _filter_genes_by_outliers(self, adata: AnnData) -> None: """ if self.filter_genes_by_outliers: - main_info("filtering outlier genes...") - main_info("gene filter kwargs:" + str(self.filter_genes_by_outliers_kwargs)) + main_debug("filtering outlier genes...") + main_debug("gene filter kwargs:" + str(self.filter_genes_by_outliers_kwargs)) self.filter_genes_by_outliers(adata, **self.filter_genes_by_outliers_kwargs) def _select_genes(self, adata: AnnData) -> None: @@ -293,8 +292,8 @@ def _select_genes(self, adata: AnnData) -> None: """ if self.select_genes: - main_info("selecting genes...") - main_info("select_genes kwargs:" + str(self.select_genes_kwargs)) + main_debug("selecting genes...") + main_debug("select_genes kwargs:" + str(self.select_genes_kwargs)) self.select_genes(adata, **self.select_genes_kwargs) def _append_gene_list(self, adata: AnnData) -> None: @@ -305,7 +304,7 @@ def _append_gene_list(self, adata: AnnData) -> None: adata: an AnnData object. """ - if self.gene_append_list is not None: + if len(self.gene_append_list) > 0: append_genes = adata.var.index.intersection(self.gene_append_list) adata.var.loc[append_genes, DKM.VAR_USE_FOR_PCA] = True main_info("appended %d extra genes as required..." % len(append_genes)) @@ -318,7 +317,7 @@ def _exclude_gene_list(self, adata: AnnData) -> None: adata: an AnnData object. """ - if self.gene_exclude_list is not None: + if len(self.gene_exclude_list) > 0: exclude_genes = adata.var.index.intersection(self.gene_exclude_list) adata.var.loc[exclude_genes, DKM.VAR_USE_FOR_PCA] = False main_info("excluded %d genes as required..." % len(exclude_genes)) @@ -339,40 +338,28 @@ def _force_gene_list(self, adata: AnnData) -> None: "OVERWRITE all gene selection results above according to user gene list inputs. %d genes in use." % len(forced_genes) ) - else: - main_info("self.force_gene_list is None, skipping filtering by gene list...") def _normalize_selected_genes(self, adata: AnnData) -> None: - """Normalize selected genes with method specified in the preprocessor's - `normalize_selected_genes` + """Normalize selected genes with method specified in the preprocessor's `normalize_selected_genes` Args: adata: an AnnData object. """ - if not callable(self.normalize_selected_genes): - main_info( - "skipping normalize by selected genes as preprocessor normalize_selected_genes is not callable..." - ) - return - - main_info("normalizing selected genes...") - self.normalize_selected_genes(adata, **self.normalize_selected_genes_kwargs) + if callable(self.normalize_selected_genes): + main_debug("normalizing selected genes...") + self.normalize_selected_genes(adata, **self.normalize_selected_genes_kwargs) def _normalize_by_cells(self, adata: AnnData) -> None: - """Performing cell-wise normalization based on method specified as the - preprocessor's `normalize_by_cells`. + """Performing cell-wise normalization based on method specified as the preprocessor's `normalize_by_cells`. Args: adata: an AnnData object. """ - if not callable(self.normalize_by_cells): - main_info("skipping normalize by cells as preprocessor normalize_by_cells is not callable...") - return - - main_info("applying normalize by cells function...") - self.normalize_by_cells(adata, **self.normalize_by_cells_function_kwargs) + if callable(self.normalize_by_cells): + main_debug("applying normalize by cells function...") + self.normalize_by_cells(adata, **self.normalize_by_cells_function_kwargs) def _log1p(self, adata: AnnData) -> None: """Perform log1p on the data with args specified in the preprocessor's @@ -390,7 +377,7 @@ def _log1p(self, adata: AnnData) -> None: # TODO: the following line is for monocle recipe and later dynamics matrix recovery # refactor with dynamics module adata.uns["pp"]["norm_method"] = "log1p" - main_info("applying log1p transformation on expression matrix data (adata.X)...") + main_debug("applying log1p transformation on expression matrix data (adata.X)...") self.log1p(adata, **self.log1p_kwargs) def _pca(self, adata: AnnData) -> None: @@ -530,7 +517,7 @@ def preprocess_adata_monocle( would be inferred from the data. Defaults to None. """ - main_info("Running preprocessing pipeline...") + main_info("Running monocle preprocessing pipeline...") temp_logger = LoggerManager.gen_logger("preprocessor-monocle") temp_logger.log_time() @@ -593,7 +580,7 @@ def preprocess_adata_seurat( temp_logger = LoggerManager.gen_logger("preprocessor-seurat") temp_logger.log_time() - main_info("Applying Seurat recipe preprocessing...") + main_info("Running Seurat recipe preprocessing...") self.standardize_adata(adata, tkey, experiment_type) self._filter_genes_by_outliers(adata) @@ -644,7 +631,7 @@ def preprocess_adata_sctransform( temp_logger = LoggerManager.gen_logger("preprocessor-sctransform") temp_logger.log_time() - main_info("Applying Sctransform recipe preprocessing...") + main_info("Running Sctransform recipe preprocessing...") self.standardize_adata(adata, tkey, experiment_type) diff --git a/dynamo/preprocessing/cell_cycle.py b/dynamo/preprocessing/cell_cycle.py index 9574735c8..fa9a9bbe3 100644 --- a/dynamo/preprocessing/cell_cycle.py +++ b/dynamo/preprocessing/cell_cycle.py @@ -440,7 +440,7 @@ def get_cell_phase( # pick maximal score as the phase for that cell cell_cycle_scores["cell_cycle_phase"] = cell_cycle_scores.idxmax(axis=1) cell_cycle_scores["cell_cycle_phase"] = cell_cycle_scores["cell_cycle_phase"].astype("category") - cell_cycle_scores["cell_cycle_phase"].cat.set_categories(phase_list, inplace=True) + cell_cycle_scores["cell_cycle_phase"].cat.set_categories(phase_list) def progress_ratio(x, phase_list): ind = phase_list.index(x["cell_cycle_phase"]) @@ -458,9 +458,9 @@ def progress_ratio(x, phase_list): # order of cell within cell cycle phase cell_cycle_scores["cell_cycle_order"] = cell_cycle_scores.groupby("cell_cycle_phase").cumcount() - cell_cycle_scores["cell_cycle_order"] = cell_cycle_scores.groupby("cell_cycle_phase")["cell_cycle_order"].apply( - lambda x: x / (len(x) - 1) - ) + cell_cycle_scores["cell_cycle_order"] = cell_cycle_scores.groupby("cell_cycle_phase", group_keys=False)[ + "cell_cycle_order" + ].apply(lambda x: x / (len(x) - 1)) return cell_cycle_scores diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index 446ed6ab2..7b7b5b4ca 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -18,6 +18,7 @@ from ..dynamo_logger import ( LoggerManager, main_critical, + main_debug, main_info, main_info_insert_adata_obsm, main_info_insert_adata_uns, @@ -1450,7 +1451,7 @@ def highest_frac_genes( not_all_zero = cell_expression_sum != 0 filtered_adata = adata[not_all_zero, :] cell_expression_sum = cell_expression_sum[not_all_zero] - main_info("%d rows(cells or subsets) are not zero. zero total RNA cells are removed." % np.sum(not_all_zero)) + main_debug("%d rows(cells or subsets) are not zero. zero total RNA cells are removed." % np.sum(not_all_zero)) valid_gene_set = set() prefix_to_genes = {} diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index 633f3f420..9a4ba5498 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -100,10 +100,9 @@ def _infer_labeling_experiment_type(adata: anndata.AnnData, tkey: str) -> Litera # total labeled RNA amount will increase (decrease) in kinetic (degradation) experiments over time. experiment_type = "kin" if k > 0 else "deg" - main_info( - f"\nDynamo detects your labeling data is from a {experiment_type} experiment. If experiment type is not corrent, please correct " - f"\nthis via supplying the correct experiment_type (one of `one-shot`, `kin`, `deg`) as " - f"needed." + main_debug( + f"\nDynamo has detected that your labeling data is from a kin experiment. \nIf the experiment type is incorrect, " + f"please provide the correct experiment_type (one-shot, kin, or deg)." ) return experiment_type @@ -543,10 +542,10 @@ def filter_cells_by_outliers( main_info_insert_adata_obs(obs_store_key) if keep_filtered: - main_info("keep filtered cell", indent_level=2) + main_debug("keep filtered cell", indent_level=2) adata.obs[obs_store_key] = filter_bool else: - main_info("inplace subsetting adata by filtered cells", indent_level=2) + main_debug("inplace subsetting adata by filtered cells", indent_level=2) adata._inplace_subset_obs(filter_bool) adata.obs[obs_store_key] = True @@ -578,23 +577,23 @@ def get_filter_mask_cells_by_outliers( for i, layer in enumerate(layers): if layer not in layer2range: - main_info( + main_debug( "skip filtering cells by layer: %s as it is not in the layer2range mapping passed in:" % layer, indent_level=2, ) continue if not DKM.check_if_layer_exist(adata, layer): - main_info("skip filtering by layer:%s as it is not in adata." % layer) + main_debug("skip filtering by layer:%s as it is not in adata." % layer) continue - main_info("filtering cells by layer:%s" % layer, indent_level=2) + main_debug("filtering cells by layer:%s" % layer, indent_level=2) layer_data = DKM.select_layer_data(adata, layer) detected_mask = detected_mask & get_sum_in_range_mask( layer_data, layer2range[layer][0], layer2range[layer][1], axis=1, data_min_val_threshold=0 ) if shared_count is not None: - main_info("filtering cells by shared counts from all layers", indent_level=2) + main_debug("filtering cells by shared counts from all layers", indent_level=2) layers = DKM.get_available_layer_keys(adata, layers, False) detected_mask = detected_mask & get_inrange_shared_counts_mask(adata, layers, shared_count, "cell") @@ -831,7 +830,7 @@ def normalize_cell_expr_by_size_factors( _norm_method = None if _norm_method in [np.log1p, np.log, np.log2, Freeman_Tukey, None] and layer != "protein": - main_info("applying %s to layer<%s>" % (_norm_method, layer)) + main_debug("applying %s to layer<%s>" % (_norm_method, layer)) CM = normalize_mat_monocle(CM, szfactors, relative_expr, pseudo_expr, _norm_method) elif layer == "protein": # norm_method == 'clr': @@ -858,7 +857,7 @@ def normalize_cell_expr_by_size_factors( main_warning(_norm_method + " is not implemented yet") if layer in ["raw", "X"]: - main_info("set adata to normalized data.") + main_info("set adata to normalized data using %s" % _norm_method) adata.X = CM elif layer == "protein" and "protein" in adata.obsm_keys(): main_info_insert_adata_obsm("X_protein") diff --git a/dynamo/preprocessing/utils.py b/dynamo/preprocessing/utils.py index 5e04a5b1b..5695e0794 100755 --- a/dynamo/preprocessing/utils.py +++ b/dynamo/preprocessing/utils.py @@ -14,8 +14,8 @@ import scipy.sparse import statsmodels.api as sm from anndata import AnnData -from scipy.sparse.linalg import LinearOperator, svds from scipy.sparse import csc_matrix, csr_matrix, issparse +from scipy.sparse.linalg import LinearOperator, svds from sklearn.decomposition import PCA, TruncatedSVD from sklearn.utils import check_random_state from sklearn.utils.extmath import svd_flip @@ -402,7 +402,7 @@ def _merge_by_diff(origin_df: pd.DataFrame, diff_df: pd.DataFrame) -> pd.DataFra The merged DataFrame. """ - _columns = set(diff_df.columns).difference(origin_df.columns) + _columns = list(set(diff_df.columns).difference(origin_df.columns)) new_df = origin_df.merge(diff_df[_columns], how="left", left_index=True, right_index=True) return new_df.loc[origin_df.index, :] @@ -622,7 +622,7 @@ def sz_util( CM = adata.layers[layer] if CM is None else CM if round_exprs: - main_info("rounding expression data of layer: %s during size factor calculation" % (layer)) + main_debug("rounding expression data of layer: %s during size factor calculation" % (layer)) if issparse(CM): CM.data = np.round(CM.data, 0) else: @@ -778,6 +778,7 @@ def decode(adata: anndata.AnnData) -> None: # --------------------------------------------------------------------------------------------------- # pca + def _truncatedSVD_with_center( X: Union[csc_matrix, csr_matrix], n_components: int = 30, @@ -844,7 +845,7 @@ def rmatmat(x): ) # Solve SVD without calculating individuals entries in LinearOperator. - U, Sigma, VT = svds(X_centered, solver='arpack', k=n_components, v0=v0) + U, Sigma, VT = svds(X_centered, solver="arpack", k=n_components, v0=v0) Sigma = Sigma[::-1] U, VT = svd_flip(U[:, ::-1], VT[::-1]) X_transformed = U * Sigma @@ -865,11 +866,11 @@ def rmatmat(x): ) X_pca = result_dict["X_pca"] fit.components_ = result_dict["components_"] - fit.explained_variance_ratio_ = result_dict[ - "explained_variance_ratio_"] + fit.explained_variance_ratio_ = result_dict["explained_variance_ratio_"] return fit, X_pca + def _pca_fit( X: np.ndarray, pca_func: Callable, @@ -893,7 +894,7 @@ class from sklearn.decomposition. A tuple containing two elements: - The fitted PCA object, which has a 'fit' and 'transform' method. - The transformed array X_pca of shape (n_samples, n_components). - """ + """ fit = pca_func( n_components=min(n_components, X.shape[1] - 1), **kwargs, @@ -1001,6 +1002,7 @@ def pca( if use_incremental_PCA: from sklearn.decomposition import IncrementalPCA + fit, X_pca = _pca_fit( X_data, pca_func=IncrementalPCA, @@ -1028,10 +1030,7 @@ def pca( # data. It only performs SVD decomposition, which is the second part # in our _truncatedSVD_with_center function. fit, X_pca = _pca_fit( - X_data, - pca_func=TruncatedSVD, - n_components=n_pca_components + 1, - random_state=random_state + X_data, pca_func=TruncatedSVD, n_components=n_pca_components + 1, random_state=random_state ) # first columns is related to the total UMI (or library size) X_pca = X_pca[:, 1:] @@ -1039,13 +1038,11 @@ def pca( adata.obsm[pca_key] = X_pca if use_incremental_PCA or adata.n_obs < use_truncated_SVD_threshold: adata.uns[pcs_key] = fit.components_.T - adata.uns[ - "explained_variance_ratio_"] = fit.explained_variance_ratio_ + adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_ else: # first columns is related to the total UMI (or library size) adata.uns[pcs_key] = fit.components_.T[:, 1:] - adata.uns[ - "explained_variance_ratio_"] = fit.explained_variance_ratio_[1:] + adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_[1:] adata.uns["pca_mean"] = fit.mean_ if hasattr(fit, "mean_") else None if return_all: From dfcc62dc4d755221f1aeecc44dc5705df92cfd5e Mon Sep 17 00:00:00 2001 From: LoveLennone <117324201+LoveLennone@users.noreply.github.com> Date: Sat, 15 Apr 2023 23:14:37 -0400 Subject: [PATCH 17/28] fix logging messages --- dynamo/dynamo_logger.py | 8 ++++---- dynamo/preprocessing/Preprocessor.py | 14 +++++++------- dynamo/preprocessing/cell_cycle.py | 2 +- dynamo/preprocessing/preprocessor_utils.py | 2 ++ dynamo/tools/dimension_reduction.py | 6 +++--- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/dynamo/dynamo_logger.py b/dynamo/dynamo_logger.py index e110ccb47..d0ebbb1b0 100644 --- a/dynamo/dynamo_logger.py +++ b/dynamo/dynamo_logger.py @@ -189,18 +189,18 @@ def report_progress(self, percent=None, count=None, total=None, progress_name="" def finish_progress(self, progress_name="", time_unit="s", indent_level=1): self.log_time() - self.report_progress(percent=100, progress_name=progress_name) + #self.report_progress(percent=100, progress_name=progress_name) saved_terminator = self.logger_stream_handler.terminator self.logger_stream_handler.terminator = "" - self.logger.info("\n") + #self.logger.info("\n") self.logger_stream_handler.flush() self.logger_stream_handler.terminator = saved_terminator if time_unit == "s": - self.info("[%s] finished [%.4fs]" % (progress_name, self.time_passed), indent_level=indent_level) + self.info("[%s] completed [%.4fs]" % (progress_name, self.time_passed), indent_level=indent_level) elif time_unit == "ms": - self.info("[%s] finished [%.4fms]" % (progress_name, self.time_passed * 1e3), indent_level=indent_level) + self.info("[%s] completed [%.4fms]" % (progress_name, self.time_passed * 1e3), indent_level=indent_level) else: raise NotImplementedError # self.logger.info("|") diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 993472bba..c2c17cf53 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -388,7 +388,7 @@ def _pca(self, adata: AnnData) -> None: """ if self.pca: - main_info("reducing dimension by PCA...") + main_info("PCA dimension reduction") self.pca(adata, **self.pca_kwargs) def _cell_cycle_score(self, adata: AnnData) -> None: @@ -539,7 +539,7 @@ def preprocess_adata_monocle( self._pca(adata) self._cell_cycle_score(adata) - temp_logger.finish_progress(progress_name="preprocess") + temp_logger.finish_progress(progress_name="Preprocessor-monocle") def config_seurat_recipe(self, adata: AnnData) -> None: """Automatically configure the preprocessor for using the seurat style recipe. @@ -588,7 +588,7 @@ def preprocess_adata_seurat( self._select_genes(adata) self._log1p(adata) self._pca(adata) - temp_logger.finish_progress(progress_name="preprocess by seurat recipe") + temp_logger.finish_progress(progress_name="Preprocessor-seurat") def config_sctransform_recipe(self, adata: AnnData) -> None: """Automatically configure the preprocessor for using the sctransform @@ -643,7 +643,7 @@ def preprocess_adata_sctransform( self.sctransform(adata, **self.sctransform_kwargs) self._pca(adata) - temp_logger.finish_progress(progress_name="preprocess by sctransform recipe") + temp_logger.finish_progress(progress_name="Preprocessor-sctransform") def config_pearson_residuals_recipe(self, adata: AnnData) -> None: """Automatically configure the preprocessor for using the Pearson @@ -682,7 +682,7 @@ def preprocess_adata_pearson_residuals( would be inferred from the data. Defaults to None. """ - temp_logger = LoggerManager.gen_logger("preprocessor-sctransform") + temp_logger = LoggerManager.gen_logger("Preprocessor-pearson residual") temp_logger.log_time() self.standardize_adata(adata, tkey, experiment_type) @@ -690,7 +690,7 @@ def preprocess_adata_pearson_residuals( self._normalize_selected_genes(adata) self._pca(adata) - temp_logger.finish_progress(progress_name="preprocess by pearson residual recipe") + temp_logger.finish_progress(progress_name="Preprocessor-pearson residual") def config_monocle_pearson_residuals_recipe(self, adata: AnnData) -> None: """Automatically configure the preprocessor for using the @@ -755,7 +755,7 @@ def preprocess_adata_monocle_pearson_residuals( # adata.layers[layer] = adata. self.pca(adata, **self.pca_kwargs) - temp_logger.finish_progress(progress_name="preprocess by monocle pearson residual recipe") + temp_logger.finish_progress(progress_name="Preprocessor-monocle-pearson-residual") def preprocess_adata( self, diff --git a/dynamo/preprocessing/cell_cycle.py b/dynamo/preprocessing/cell_cycle.py index fa9a9bbe3..f84384d23 100644 --- a/dynamo/preprocessing/cell_cycle.py +++ b/dynamo/preprocessing/cell_cycle.py @@ -508,7 +508,7 @@ def cell_cycle_scores( gene_list=gene_list, threshold=threshold, ) - temp_timer_logger.finish_progress(progress_name="cell phase estimation") + temp_timer_logger.finish_progress(progress_name="Cell Phase Estimation") cell_cycle_scores.index = adata.obs_names[cell_cycle_scores.index.values.astype("int")] diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index 9a4ba5498..07062e2a1 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -432,6 +432,7 @@ def filter_genes_by_outliers( filter_bool = filter_bool & detected_bool if filter_bool is not None else detected_bool adata.var["pass_basic_filter"] = np.array(filter_bool).flatten() + main_info("filtered out %d outlier genes" % (adata.n_vars - sum(filter_bool)), indent_level=2) if inplace: adata._inplace_subset_var(adata.var["pass_basic_filter"]) @@ -540,6 +541,7 @@ def filter_cells_by_outliers( else: filter_bool = np.array(filter_bool) & detected_bool + main_info("filtered out %d outlier cells" % (adata.n_obs - sum(filter_bool)), indent_level=2) main_info_insert_adata_obs(obs_store_key) if keep_filtered: main_debug("keep filtered cell", indent_level=2) diff --git a/dynamo/tools/dimension_reduction.py b/dynamo/tools/dimension_reduction.py index fd14d2377..3433a4467 100755 --- a/dynamo/tools/dimension_reduction.py +++ b/dynamo/tools/dimension_reduction.py @@ -78,7 +78,7 @@ def reduceDimension( adata = copy_adata(adata) if copy else adata - logger.info("retrive data for non-linear dimension reduction...", indent_level=1) + logger.debug("retrive data for non-linear dimension reduction...", indent_level=1) if X_data is None: X_data, n_components, basis = prepare_dim_reduction( adata, @@ -107,7 +107,7 @@ def reduceDimension( conn_key, dist_key, neighbor_key = _gen_neighbor_keys(neighbor_result_prefix) if enforce or not has_basis: - logger.info(f"perform {reduction_method}...", indent_level=1) + logger.info(f"[{reduction_method.upper()}] using {basis} with n_pca_components = {n_pca_components}", indent_level=1) adata = run_reduce_dim( adata, X_data, @@ -123,7 +123,7 @@ def reduceDimension( if neighbor_key not in adata.uns_keys(): neighbors(adata) - logger.finish_progress(progress_name="dimension_reduction projection") + logger.finish_progress(progress_name=reduction_method.upper()) if copy: return adata From 472ef0c14f5e00c6dac161195097163029ac60fd Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Thu, 20 Apr 2023 20:00:10 -0400 Subject: [PATCH 18/28] deprecate top_table --- dynamo/plot/preprocess.py | 57 +++------------- dynamo/preprocessing/Preprocessor.py | 93 +++++++++++++++++-------- dynamo/preprocessing/__init__.py | 8 +-- dynamo/preprocessing/_deprecated.py | 71 ++++++++++++++++--- dynamo/preprocessing/gene_selection.py | 76 ++------------------- dynamo/preprocessing/preprocess.py | 94 ++------------------------ 6 files changed, 144 insertions(+), 255 deletions(-) diff --git a/dynamo/plot/preprocess.py b/dynamo/plot/preprocess.py index 2ce051d14..bf24d85e2 100755 --- a/dynamo/plot/preprocess.py +++ b/dynamo/plot/preprocess.py @@ -9,7 +9,7 @@ from ..configuration import DynamoAdataKeyManager from ..dynamo_logger import main_warning from ..preprocessing import preprocess as pp -from ..preprocessing.gene_selection import get_prediction_by_svr, top_table +from ..preprocessing.gene_selection import get_prediction_by_svr from ..preprocessing.utils import detect_experiment_datatype from ..tools.utils import get_mapper, update_dict from .utils import save_fig @@ -649,10 +649,9 @@ def feature_genes( save_show_or_return: str = "show", save_kwargs: dict = {}, ): - """Plot selected feature genes on top of the mean vs. dispersion scatterplot. + """Plot selected feature genes on top of the mean vs. dispersion scatter plot. - Parameters - ---------- + Args: adata: :class:`~anndata.AnnData` AnnData object layer: `str` (default: `X`) @@ -664,13 +663,12 @@ def feature_genes( save_show_or_return: {'show', 'save', 'return'} (default: `show`) Whether to save, show or return the figure. save_kwargs: `dict` (default: `{}`) - A dictionary that will passed to the save_fig function. By default it is an empty dictionary and the + A dictionary that will be passed to the save_fig function. By default, it is an empty dictionary and the save_fig function will use the {"path": None, "prefix": 'feature_genes', "dpi": None, "ext": 'pdf', - "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise you can provide a + "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a dictionary that properly modify those keys according to your needs. - Returns - ------- + Returns: Nothing but plots the selected feature genes via the mean, CV plot. """ @@ -680,16 +678,7 @@ def feature_genes( layer = DynamoAdataKeyManager.get_available_layer_keys(adata, layer, include_protein=False)[0] uns_store_key = None - if mode == "dispersion": # TODO: Deprecated. - main_warning("dispersion is deprecated for soon-to-be removed features.") - uns_store_key = "dispFitInfo" if layer in ["raw", "X"] else layer + "_dispFitInfo" - - table = top_table(adata, layer) - x_min, x_max = ( - np.nanmin(table["mean_expression"]), - np.nanmax(table["mean_expression"]), - ) - elif "_dispersion" in mode: # "cv_dispersion", "fano_dispersion" + if "_dispersion" in mode: # "cv_dispersion", "fano_dispersion" prefix = "" if layer == "X" else layer + "_" uns_store_key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR" @@ -705,8 +694,6 @@ def feature_genes( np.nanmin(table[prefix + "log_m"]), np.nanmax(table[prefix + "log_m"]), ) - else: # TODO: Gini? - raise NotImplementedError(f"The mode{mode} to plot the feature genes not implemented yet") ordering_genes = adata.var["use_for_pca"] if "use_for_pca" in adata.var.columns else None @@ -717,13 +704,6 @@ def feature_genes( svr_gamma = adata.uns[uns_store_key]["svr_gamma"] fit = get_prediction_by_svr(mean, cv, svr_gamma) fit = fit(mu_linspace.reshape(-1, 1)) - else: - raise NotImplementedError(f"The mode{mode} to plot the feature genes not implemented yet") - # fit = ( - # adata.uns[uns_store_key]["disp_func"](mu_linspace) - # if mode == "dispersion" - # else adata.uns[uns_store_key]["SVR"](mu_linspace.reshape(-1, 1)) - # ) plt.figure(figsize=figsize) plt.plot(mu_linspace, fit, alpha=0.4, color="r") @@ -734,15 +714,7 @@ def feature_genes( ) valid_disp_table = table.iloc[valid_ind, :] - if mode == "dispersion": - ax = plt.scatter( - valid_disp_table["mean_expression"], - valid_disp_table["dispersion_empirical"], - s=3, - alpha=1, - color="xkcd:red", - ) - elif "_dispersion" in mode: + if "_dispersion" in mode: ax = plt.scatter( valid_disp_table[prefix + "log_m"], valid_disp_table[prefix + "log_cv"], @@ -753,15 +725,7 @@ def feature_genes( neg_disp_table = table.iloc[~valid_ind, :] - if mode == "dispersion": - ax = plt.scatter( - neg_disp_table["mean_expression"], - neg_disp_table["dispersion_empirical"], - s=3, - alpha=0.5, - color="xkcd:grey", - ) - elif "_dispersion" in mode: + if "_dispersion" in mode: ax = plt.scatter( neg_disp_table[prefix + "log_m"], neg_disp_table[prefix + "log_cv"], @@ -770,9 +734,6 @@ def feature_genes( color="xkcd:grey", ) - # plt.xlim((0, 100)) - if mode == "dispersion": - plt.xscale("log") plt.yscale("log") plt.xlabel("Mean (log)") plt.ylabel("Dispersion (log)") if mode == "dispersion" else plt.ylabel("CV (log)") diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index c2c17cf53..e541f4ab8 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -24,8 +24,8 @@ ) from .cell_cycle import cell_cycle_scores from .gene_selection import select_genes_by_seurat_recipe, select_genes_monocle -from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca -from .preprocessor_utils import _infer_labeling_experiment_type +from .preprocess import pca +from .preprocessor_utils import _infer_labeling_experiment_type, calc_sz_factor from .preprocessor_utils import ( filter_cells_by_outliers as monocle_filter_cells_by_outliers, ) @@ -49,7 +49,7 @@ class Preprocessor: def __init__( self, - collapse_speicies_adata_function: Callable = collapse_species_adata, + collapse_species_adata_function: Callable = collapse_species_adata, convert_gene_name_function: Callable = convert2symbol, filter_cells_by_outliers_function: Callable = monocle_filter_cells_by_outliers, filter_cells_by_outliers_kwargs: dict = {}, @@ -57,6 +57,8 @@ def __init__( filter_genes_by_outliers_kwargs: dict = {}, normalize_by_cells_function: Callable = normalize_cell_expr_by_size_factors, normalize_by_cells_function_kwargs: dict = {}, + size_factor_function: Callable = calc_sz_factor, + size_factor_kwargs: dict = {}, select_genes_function: Callable = select_genes_monocle, select_genes_kwargs: dict = {}, normalize_selected_genes_function: Callable = None, @@ -74,13 +76,12 @@ def __init__( """Preprocessor constructor. The default preprocess functions are those of monocle recipe by default. - You can pass your own Callable objects (functions) to this constructor - directly, which wil be used in the preprocess steps later. These - functions parameters are saved into Preprocessor instances. You can set - these attributes directly to your own implementation. + You can pass your own Callable objects (functions) to this constructor directly, which wil be used in + the preprocess steps later. These functions parameters are saved into Preprocessor instances. + You can set these attributes directly to your own implementation. Args: - collapse_speicies_adata_function: function for collapsing the species data. Defaults to + collapse_species_adata_function: function for collapsing the species data. Defaults to collapse_species_adata. convert_gene_name_function: transform gene names, by default convert2symbol, which transforms unofficial gene names to official gene names. Defaults to convert2symbol. @@ -100,10 +101,10 @@ def __init__( log1p_kwargs: arguments passed to use_log1p. Defaults to {}. pca_function: function to perform pca. Defaults to pca in utils.py. pca_kwargs: arguments that will be passed pca. Defaults to {}. - gene_append_list: ensure that a list of genes show up in selected genes in monocle recipe pipeline. + gene_append_list: ensure that a list of genes show up in selected genes across all the recipe pipeline. Defaults to []. - gene_exclude_list: exclude a list of genes in monocle recipe pipeline. Defaults to []. - force_gene_list: use this gene list as selected genes in monocle recipe pipeline. Defaults to None. + gene_exclude_list: exclude a list of genes across all the recipe pipeline. Defaults to []. + force_gene_list: use this gene list as selected genes across all the recipe pipeline. Defaults to None. sctransform_kwargs: arguments passed into sctransform function. Defaults to {}. """ @@ -116,6 +117,7 @@ def __init__( self.filter_cells_by_outliers = filter_cells_by_outliers_function self.filter_genes_by_outliers = filter_genes_by_outliers_function self.normalize_by_cells = normalize_by_cells_function + self.calc_size_factor = size_factor_function self.select_genes = select_genes_function self.normalize_selected_genes = normalize_selected_genes_function self.use_log1p = use_log1p @@ -126,7 +128,7 @@ def __init__( # self.n_top_genes = n_top_genes self.convert_gene_name = convert_gene_name_function - self.collapse_species_adata = collapse_speicies_adata_function + self.collapse_species_adata = collapse_species_adata_function self.gene_append_list = gene_append_list self.gene_exclude_list = gene_exclude_list self.force_gene_list = force_gene_list @@ -135,6 +137,7 @@ def __init__( self.filter_genes_by_outliers_kwargs = filter_genes_by_outliers_kwargs self.normalize_by_cells_function_kwargs = normalize_by_cells_function_kwargs self.filter_cells_by_outliers_kwargs = filter_cells_by_outliers_kwargs + self.size_factor_kwargs = size_factor_kwargs self.select_genes_kwargs = select_genes_kwargs self.sctransform_kwargs = sctransform_kwargs self.normalize_selected_genes_kwargs = normalize_selected_genes_kwargs @@ -271,8 +274,7 @@ def _filter_cells_by_outliers(self, adata: AnnData) -> None: self.filter_cells_by_outliers(adata, **self.filter_cells_by_outliers_kwargs) def _filter_genes_by_outliers(self, adata: AnnData) -> None: - """Select valid genes based on the method specified as the - preprocessor's `filter_genes_by_outliers`. + """Select valid genes based on the method specified as the preprocessor's `filter_genes_by_outliers`. Args: adata: an AnnData object. @@ -283,9 +285,25 @@ def _filter_genes_by_outliers(self, adata: AnnData) -> None: main_debug("gene filter kwargs:" + str(self.filter_genes_by_outliers_kwargs)) self.filter_genes_by_outliers(adata, **self.filter_genes_by_outliers_kwargs) + def _calc_size_factor(self, adata: AnnData) -> None: + """Calculate the size factor of each cell based on method specified as the preprocessor's `calc_size_factor`. + + Args: + adata: an AnnData object. + """ + + if self.calc_size_factor: + main_debug("size factor calculation...") + main_debug("size_factor_kwargs kwargs:" + str(self.size_factor_kwargs)) + self.calc_size_factor( + adata, + total_layers=adata.uns["pp"]["experiment_total_layers"], + layers=adata.uns["pp"]["experiment_layers"], + **self.size_factor_kwargs + ) + def _select_genes(self, adata: AnnData) -> None: - """selecting gene by features, based on method specified as the - preprocessor's `select_genes`. + """selecting gene by features, based on method specified as the preprocessor's `select_genes`. Args: adata: an AnnData object. @@ -297,8 +315,7 @@ def _select_genes(self, adata: AnnData) -> None: self.select_genes(adata, **self.select_genes_kwargs) def _append_gene_list(self, adata: AnnData) -> None: - """Add genes to the feature gene list detected by the preprocessing - steps. + """Add genes to the feature gene list detected by the preprocessing steps. Args: adata: an AnnData object. @@ -310,8 +327,7 @@ def _append_gene_list(self, adata: AnnData) -> None: main_info("appended %d extra genes as required..." % len(append_genes)) def _exclude_gene_list(self, adata: AnnData) -> None: - """Remove genes from the feature gene list detected by the preprocessing - steps. + """Remove genes from the feature gene list detected by the preprocessing steps. Args: adata: an AnnData object. @@ -434,10 +450,7 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None Args: adata: an AnnData object. - n_top_genes: Number of top feature genes to select in the - preprocessing step. Defaults to 2000. - gene_selection_method: Which sorting method to be used to select - genes. Defaults to "SVR". + n_top_genes: Number of top feature genes to select in the preprocessing step. Defaults to 2000. """ n_obs, n_genes = adata.n_obs, adata.n_vars @@ -478,7 +491,6 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None "keep_filtered": True, "SVRs_kwargs": { "relative_expr": True, - "total_szfactor": "total_Size_Factor", "min_expr_cells": 0, "min_expr_avg": 0, "max_expr_avg": np.inf, @@ -525,15 +537,17 @@ def preprocess_adata_monocle( self._filter_cells_by_outliers(adata) self._filter_genes_by_outliers(adata) + # The following size factor calculation is a prerequisite for monocle recipe preprocess in preprocessor. + self._calc_size_factor(adata) + self._select_genes(adata) - # gene selection has been completed above. Now we need to append/delete/force selected gene list required by users. + # append/delete/force selected gene list required by users. self._append_gene_list(adata) self._exclude_gene_list(adata) self._force_gene_list(adata) - self._normalize_selected_genes(adata) - self._normalize_by_cells(adata) + # self._normalize_by_cells(adata) self._log1p(adata) self._pca(adata) @@ -586,6 +600,12 @@ def preprocess_adata_seurat( self._filter_genes_by_outliers(adata) self._normalize_by_cells(adata) self._select_genes(adata) + + # append/delete/force selected gene list required by users. + self._append_gene_list(adata) + self._exclude_gene_list(adata) + self._force_gene_list(adata) + self._log1p(adata) self._pca(adata) temp_logger.finish_progress(progress_name="Preprocessor-seurat") @@ -640,6 +660,12 @@ def preprocess_adata_sctransform( self._select_genes(adata) # TODO: if inplace in select_genes is True, the following subset is unnecessary. adata._inplace_subset_var(adata.var["use_for_pca"]) + + # append/delete/force selected gene list required by users. + self._append_gene_list(adata) + self._exclude_gene_list(adata) + self._force_gene_list(adata) + self.sctransform(adata, **self.sctransform_kwargs) self._pca(adata) @@ -687,6 +713,11 @@ def preprocess_adata_pearson_residuals( self.standardize_adata(adata, tkey, experiment_type) self._select_genes(adata) + # append/delete/force selected gene list required by users. + self._append_gene_list(adata) + self._exclude_gene_list(adata) + self._force_gene_list(adata) + self._normalize_selected_genes(adata) self._pca(adata) @@ -742,6 +773,12 @@ def preprocess_adata_monocle_pearson_residuals( temp_logger.log_time() self.standardize_adata(adata, tkey, experiment_type) self._select_genes(adata) + + # append/delete/force selected gene list required by users. + self._append_gene_list(adata) + self._exclude_gene_list(adata) + self._force_gene_list(adata) + X_copy = adata.X.copy() self._normalize_by_cells(adata) adata.X = X_copy diff --git a/dynamo/preprocessing/__init__.py b/dynamo/preprocessing/__init__.py index 4b28794f0..0c82504f2 100755 --- a/dynamo/preprocessing/__init__.py +++ b/dynamo/preprocessing/__init__.py @@ -35,12 +35,7 @@ normalize_cells = normalize_cell_expr_by_size_factors from .CnmfPreprocessor import CnmfPreprocessor -from .gene_selection import ( - Gini, - select_genes_by_svr, - select_genes_monocle, - top_table, -) +from .gene_selection import Gini, select_genes_by_svr, select_genes_monocle from .Preprocessor import Preprocessor __all__ = [ @@ -54,7 +49,6 @@ "recipe_monocle", "recipe_velocyto", "Gini", - "top_table", "filter_cells_by_outliers", "select_genes_monocle", "filter_genes", diff --git a/dynamo/preprocessing/_deprecated.py b/dynamo/preprocessing/_deprecated.py index 3f5e4c801..68fcd5a35 100644 --- a/dynamo/preprocessing/_deprecated.py +++ b/dynamo/preprocessing/_deprecated.py @@ -1,17 +1,14 @@ -from typing import Dict, List, Optional, Tuple, Union -from anndata import AnnData +import re +from typing import Dict, List, Literal, Optional, Tuple + import numpy as np import pandas as pd import statsmodels.api as sm -import re -from ..configuration import DKM -from ..dynamo_logger import ( - LoggerManager, - main_debug, - main_info, - main_warning, -) +from anndata import AnnData from scipy.sparse import csr_matrix, issparse + +from ..configuration import DKM +from ..dynamo_logger import LoggerManager, main_debug, main_info, main_warning from .utils import cook_dist @@ -236,4 +233,56 @@ def ans(q): "coefs": coefs, } - return adata \ No newline at end of file + return adata + + +def _top_table(adata: AnnData, layer: str = "X", mode: Literal["dispersion", "gini"] = "dispersion") -> pd.DataFrame: + """Retrieve a table that contains gene names and other info whose dispersions/gini index are highest. + + This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). + + Get information of the top layer. + + Args: + adata: an AnnData object. + layer: the layer(s) that would be searched for. Defaults to "X". + mode: either "dispersion" or "gini", deciding whether dispersion data or gini data would be acquired. Defaults + to "dispersion". + + Raises: + KeyError: if mode is set to dispersion but there is no available dispersion model. + + Returns: + The data frame of the top layer with the gene_id, mean_expression, dispersion_fit and dispersion_empirical as + the columns. + """ + + layer = DKM.get_available_layer_keys(adata, layers=layer, include_protein=False)[0] + + if layer in ["X"]: + key = "dispFitInfo" + else: + key = layer + "_dispFitInfo" + + if mode == "dispersion": + if adata.uns[key] is None: + _estimate_dispersion(adata, layers=[layer]) + raise KeyError( + "Error: for adata.uns.key=%s, no dispersion model found. Please call estimate_dispersion() before calling this function" + % key + ) + + top_df = pd.DataFrame( + { + "gene_id": adata.uns[key]["disp_table"]["gene_id"], + "mean_expression": adata.uns[key]["disp_table"]["mu"], + "dispersion_fit": adata.uns[key]["disp_func"](adata.uns[key]["disp_table"]["mu"]), + "dispersion_empirical": adata.uns[key]["disp_table"]["disp"], + } + ) + top_df = top_df.set_index("gene_id") + + elif mode == "gini": + top_df = adata.var[layer + "_gini"] + + return top_df diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 6c4762e94..47a1fc607 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -22,9 +22,7 @@ main_info_insert_adata_var, main_warning, ) -from ._deprecated import _estimate_dispersion from .preprocessor_utils import ( - calc_sz_factor, get_nan_or_inf_data_bool_mask, get_svr_filter, seurat_get_mean_var, @@ -93,59 +91,6 @@ def compute_gini(CM): return adata -def top_table(adata: AnnData, layer: str = "X", mode: Literal["dispersion", "gini"] = "dispersion") -> pd.DataFrame: - """Retrieve a table that contains gene names and other info whose dispersions/gini index are highest. - - This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). - - Get information of the top layer. - - Args: - adata: an AnnData object. - layer: the layer(s) that would be searched for. Defaults to "X". - mode: either "dispersion" or "gini", deciding whether dispersion data or gini data would be acquired. Defaults - to "dispersion". - - Raises: - KeyError: if mode is set to dispersion but there is no available dispersion model. - - Returns: - The data frame of the top layer with the gene_id, mean_expression, dispersion_fit and dispersion_empirical as - the columns. - """ - - layer = DKM.get_available_layer_keys(adata, layers=layer, include_protein=False)[0] - - if layer in ["X"]: - key = "dispFitInfo" - else: - key = layer + "_dispFitInfo" - - if mode == "dispersion": - if adata.uns[key] is None: - main_warning("dispersion mode is deprecated. This mode will be removed in the future.") - _estimate_dispersion(adata, layers=[layer]) - raise KeyError( - "Error: for adata.uns.key=%s, no dispersion model found. Please call estimate_dispersion() before calling this function" - % key - ) - - top_df = pd.DataFrame( - { - "gene_id": adata.uns[key]["disp_table"]["gene_id"], - "mean_expression": adata.uns[key]["disp_table"]["mu"], - "dispersion_fit": adata.uns[key]["disp_func"](adata.uns[key]["disp_table"]["mu"]), - "dispersion_empirical": adata.uns[key]["disp_table"]["disp"], - } - ) - top_df = top_df.set_index("gene_id") - - elif mode == "gini": - top_df = adata.var[layer + "_gini"] - - return top_df - - def select_genes_monocle( adata: AnnData, layer: str = "X", @@ -173,22 +118,10 @@ def select_genes_monocle( genes_to_exclude: genes that are excluded from evaluation. Defaults to None. SVRs_kwargs: kwargs for `SVRs`. Defaults to {}. - Returns: - The adata object with genes updated if `only_bools` is false. Otherwise, the bool array representing selected - genes. + Raises: + NotImplementedError: the 'sort_by' algorithm is invalid/unsupported. """ - # The following size factor calculation is a prerequisite for monocle recipe preprocess in preprocessor. - adata = calc_sz_factor( - adata, - total_layers=adata.uns["pp"]["experiment_total_layers"], - scale_to=None, - splicing_total_layers=False, - X_total_layers=False, - layers=adata.uns["pp"]["experiment_layers"], - genes_use_for_norm=None, - ) - filter_bool = ( adata.var["pass_basic_filter"] if "pass_basic_filter" in adata.var.columns @@ -215,10 +148,11 @@ def select_genes_monocle( ) filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) else: - raise ValueError(f"The algorithm {sort_by} is not existed") + raise NotImplementedError(f"The algorithm {sort_by} is invalid/unsupported") # filter genes by gene expression fraction as well - adata.var["frac"], invalid_ids = compute_gene_exp_fraction(X=adata.X, threshold=exprs_frac_for_gene_exclusion) + if "frac" not in adata.var.keys(): + adata.var["frac"], invalid_ids = compute_gene_exp_fraction(X=adata.X, threshold=exprs_frac_for_gene_exclusion) genes_to_exclude = ( list(adata.var_names[invalid_ids]) if genes_to_exclude is None diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index 7b7b5b4ca..097a60756 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -26,8 +26,9 @@ ) from ..tools.utils import update_dict from ..utils import copy_adata +from ._deprecated import _top_table from .cell_cycle import cell_cycle_scores -from .gene_selection import select_genes_by_svr, top_table +from .gene_selection import select_genes_by_svr from .preprocessor_utils import ( _infer_labeling_experiment_type, filter_cells_by_outliers, @@ -308,93 +309,6 @@ def normalize_cell_expr_by_size_factors_legacy( return adata -def disp_calc_helper_NB( - adata: anndata.AnnData, layers: str = "X", min_cells_detected: int = 1 -) -> Tuple[List[str], List[pd.DataFrame]]: - """Calculate the dispersion parameter of the negative binomial distribution. - - This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3). - - Args: - adata: an adata object - layers: the layer of data used for dispersion fitting. Defaults to "X". - min_cells_detected: the minimal required number of cells with expression for selecting gene for dispersion - fitting. Defaults to 1. - - Returns: - A tuple (layers, res_list), where layers is a list of layers available and res_list is a list of pd.DataFrame's - with mu, dispersion for each gene that passes filters. - """ - - layers = DynamoAdataKeyManager.get_available_layer_keys(adata, layers=layers, include_protein=False) - - res_list = [] - for layer in layers: - if layer == "raw": - CM = adata.raw.X - szfactors = adata.obs[layer + "Size_Factor"][:, None] - elif layer == "X": - CM = adata.X - szfactors = adata.obs["Size_Factor"][:, None] - else: - CM = adata.layers[layer] - szfactors = adata.obs[layer + "Size_Factor"][:, None] - - if issparse(CM): - CM.data = np.round(CM.data, 0) - rounded = CM - else: - rounded = CM.round().astype("int") - - lowerDetectedLimit = adata.uns["lowerDetectedLimit"] if "lowerDetectedLimit" in adata.uns.keys() else 1 - nzGenes = (rounded > lowerDetectedLimit).sum(axis=0) - nzGenes = nzGenes > min_cells_detected - - nzGenes = nzGenes.A1 if issparse(rounded) else nzGenes - if layer.startswith("X_"): - x = rounded[:, nzGenes] - else: - x = ( - rounded[:, nzGenes].multiply(csr_matrix(1 / szfactors)) - if issparse(rounded) - else rounded[:, nzGenes] / szfactors - ) - - xim = np.mean(1 / szfactors) if szfactors is not None else 1 - - f_expression_mean = x.mean(axis=0) - - # For NB: Var(Y) = mu * (1 + mu / k) - # x.A.var(axis=0, ddof=1) - f_expression_var = ( - (x.multiply(x).mean(0).A1 - f_expression_mean.A1**2) * x.shape[0] / (x.shape[0] - 1) - if issparse(x) - else x.var(axis=0, ddof=0) ** 2 - ) # np.mean(np.power(x - f_expression_mean, 2), axis=0) # variance with n - 1 - # https://scialert.net/fulltext/?doi=ajms.2010.1.15 method of moments - disp_guess_meth_moments = f_expression_var - xim * f_expression_mean # variance - mu - - disp_guess_meth_moments = disp_guess_meth_moments / np.power( - f_expression_mean, 2 - ) # this is dispersion parameter (1/k) - - res = pd.DataFrame( - { - "mu": np.array(f_expression_mean).flatten(), - "disp": np.array(disp_guess_meth_moments).flatten(), - } - ) - res.loc[res["mu"] == 0, "mu"] = None - res.loc[res["mu"] == 0, "disp"] = None - res.loc[res["disp"] < 0, "disp"] = 0 - - res["gene_id"] = adata.var_names[nzGenes] - - res_list.append(res) - - return layers, res_list - - def vstExprs( adata: anndata.AnnData, expr_matrix: Union[np.ndarray, None] = None, @@ -1561,7 +1475,7 @@ def select_genes_monocle_legacy( filter_bool = np.ones(adata.shape[1], dtype=bool) else: if sort_by == "dispersion": - table = top_table(adata, layer, mode="dispersion") + table = _top_table(adata, layer, mode="dispersion") valid_table = table.query("dispersion_empirical > dispersion_fit") valid_table = valid_table.loc[ set(adata.var.index[filter_bool]).intersection(valid_table.index), @@ -1571,7 +1485,7 @@ def select_genes_monocle_legacy( gene_id = valid_table.iloc[gene_id, :].index filter_bool = adata.var.index.isin(gene_id) elif sort_by == "gini": - table = top_table(adata, layer, mode="gini") + table = _top_table(adata, layer, mode="gini") valid_table = table.loc[filter_bool, :] gene_id = np.argsort(-valid_table.loc[:, "gini"])[:n_top_genes] gene_id = valid_table.index[gene_id] From 0e6b0c87000f5c90c867a3e6e86793473bd42c40 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Fri, 21 Apr 2023 17:24:54 -0400 Subject: [PATCH 19/28] Fix issues and reorder the normalization steps --- dynamo/plot/preprocess.py | 2 +- dynamo/preprocessing/Preprocessor.py | 16 ++++------ dynamo/preprocessing/gene_selection.py | 37 +++++++++++----------- dynamo/preprocessing/preprocessor_utils.py | 15 +++++---- dynamo/preprocessing/utils.py | 12 +++---- 5 files changed, 41 insertions(+), 41 deletions(-) diff --git a/dynamo/plot/preprocess.py b/dynamo/plot/preprocess.py index bf24d85e2..eb983cf1f 100755 --- a/dynamo/plot/preprocess.py +++ b/dynamo/plot/preprocess.py @@ -702,7 +702,7 @@ def feature_genes( mean = adata.uns[uns_store_key]["mean"] cv = adata.uns[uns_store_key]["cv"] svr_gamma = adata.uns[uns_store_key]["svr_gamma"] - fit = get_prediction_by_svr(mean, cv, svr_gamma) + fit, _ = get_prediction_by_svr(mean, cv, svr_gamma) fit = fit(mu_linspace.reshape(-1, 1)) plt.figure(figsize=figsize) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index e541f4ab8..7baee3bae 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -455,7 +455,6 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None n_obs, n_genes = adata.n_obs, adata.n_vars n_cells = n_obs - self.use_log1p = False self.filter_cells_by_outliers = monocle_filter_cells_by_outliers self.filter_cells_by_outliers_kwargs = { "filter_bool": None, @@ -502,9 +501,7 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None } self.normalize_selected_genes = None self.normalize_by_cells = normalize_cell_expr_by_size_factors - - # recipe monocle log1p all raw data in normalize_by_cells (dynamo version), so we do not need extra log1p transform. - self.use_log1p = False + self.use_log1p = True self.pca = pca self.pca_kwargs = {"pca_key": "X_pca"} @@ -534,12 +531,12 @@ def preprocess_adata_monocle( temp_logger.log_time() self.standardize_adata(adata, tkey, experiment_type) - self._filter_cells_by_outliers(adata) self._filter_genes_by_outliers(adata) + # The following size factor calculation is a prerequisite for monocle recipe preprocess in preprocessor. self._calc_size_factor(adata) - + self._normalize_by_cells(adata) self._select_genes(adata) # append/delete/force selected gene list required by users. @@ -547,8 +544,6 @@ def preprocess_adata_monocle( self._exclude_gene_list(adata) self._force_gene_list(adata) - # self._normalize_by_cells(adata) - self._log1p(adata) self._pca(adata) self._cell_cycle_score(adata) @@ -598,6 +593,7 @@ def preprocess_adata_seurat( self.standardize_adata(adata, tkey, experiment_type) self._filter_genes_by_outliers(adata) + self._calc_size_factor(adata) self._normalize_by_cells(adata) self._select_genes(adata) @@ -654,9 +650,11 @@ def preprocess_adata_sctransform( main_info("Running Sctransform recipe preprocessing...") self.standardize_adata(adata, tkey, experiment_type) - self._filter_cells_by_outliers(adata) self._filter_genes_by_outliers(adata) + + self._calc_size_factor(adata) + self._normalize_by_cells(adata) self._select_genes(adata) # TODO: if inplace in select_genes is True, the following subset is unnecessary. adata._inplace_subset_var(adata.var["use_for_pca"]) diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 47a1fc607..0d3b9f1f8 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -93,7 +93,7 @@ def compute_gini(CM): def select_genes_monocle( adata: AnnData, - layer: str = "X", + layer: str = DKM.X_LAYER, keep_filtered: bool = True, n_top_genes: int = 2000, sort_by: Literal["gini", "cv_dispersion", "fano_dispersion"] = "cv_dispersion", @@ -139,13 +139,14 @@ def select_genes_monocle( feature_gene_idx = valid_table.index[feature_gene_idx] filter_bool = filter_bool.index.isin(feature_gene_idx) elif sort_by == "cv_dispersion" or sort_by == "fano_dispersion": - adata = select_genes_by_svr( - adata, - layers=[layer], - filter_bool=filter_bool, - algorithm=sort_by, - **SVRs_kwargs, - ) + if not any("velocyto_SVR" in key for key in adata.uns.keys()): + select_genes_by_svr( + adata, + layers=layer, + filter_bool=filter_bool, + algorithm=sort_by, + **SVRs_kwargs, + ) filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) else: raise NotImplementedError(f"The algorithm {sort_by} is invalid/unsupported") @@ -153,6 +154,7 @@ def select_genes_monocle( # filter genes by gene expression fraction as well if "frac" not in adata.var.keys(): adata.var["frac"], invalid_ids = compute_gene_exp_fraction(X=adata.X, threshold=exprs_frac_for_gene_exclusion) + genes_to_exclude = ( list(adata.var_names[invalid_ids]) if genes_to_exclude is None @@ -229,7 +231,7 @@ def select_genes_by_svr( continue mean, cv = get_mean_cv(valid_CM, algorithm, winsorize, winsor_perc) - fitted_fun = get_prediction_by_svr(mean, cv, svr_gamma) + fitted_fun, svr_gamma = get_prediction_by_svr(mean, cv, svr_gamma) score = cv - fitted_fun(mean) if sort_inverse: score = -score @@ -317,13 +319,13 @@ def get_vaild_CM( if total_szfactor is not None and total_szfactor in adata.obs.keys(): szfactors = adata.obs[total_szfactor].values[:, None] if total_szfactor in adata.obs.columns else None - if szfactors is not None and relative_expr: - if issparse(CM): - from sklearn.utils import sparsefuncs - - sparsefuncs.inplace_row_scale(CM, 1 / szfactors) - else: - CM /= szfactors + # if szfactors is not None and relative_expr: + # if issparse(CM): + # from sklearn.utils import sparsefuncs + # + # sparsefuncs.inplace_row_scale(CM, 1 / szfactors) + # else: + # CM /= szfactors if winsorize: if min_expr_cells <= ((100 - winsor_perc[1]) * CM.shape[0] * 0.01): @@ -359,7 +361,6 @@ def get_mean_cv( (gene_counts_stats, gene_fano_parameters) = get_highvar_genes_sparse(valid_CM) mean = np.array(gene_counts_stats["mean"]).flatten()[:, None] cv = np.array(gene_counts_stats["fano"]).flatten() - mu = gene_counts_stats["mean"] return mean, cv elif algorithm == "cv_dispersion": if winsorize: @@ -418,7 +419,7 @@ def get_prediction_by_svr(ground: np.ndarray, target: np.ndarray, svr_gamma: Opt # Fit the Support Vector Regression clf = SVR(gamma=svr_gamma) clf.fit(ground, target) - return clf.predict + return clf.predict, svr_gamma # Highly variable gene selection function: diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index 07062e2a1..ab7dac0fc 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -66,7 +66,7 @@ def is_log1p_transformed_adata(adata: anndata.AnnData) -> bool: chosen_gene_indices = np.random.choice(adata.n_vars, 10) _has_log1p_transformed = not np.allclose( np.array(adata.X[:, chosen_gene_indices].sum(1)), - np.array(adata.layers["spliced"][:, chosen_gene_indices].sum(1)), + np.array(adata.layers["X_spliced"][:, chosen_gene_indices].sum(1)), atol=1e-4, ) return _has_log1p_transformed @@ -793,10 +793,8 @@ def normalize_cell_expr_by_size_factors( layer_sz_column_names.extend(["Size_Factor"]) # layers_to_sz = list(set(layer_sz_column_names).difference(adata.obs.keys())) layers_to_sz = list(set(layer_sz_column_names)) - if len(layers_to_sz) > 0: - layers = pd.Series(layers_to_sz).str.split("_Size_Factor", expand=True).iloc[:, 0].tolist() - if "Size_Factor" in layers: - layers[np.where(np.array(layers) == "Size_Factor")[0][0]] = "X" + + if not all(key in adata.obs.keys() for key in layers_to_sz): calc_sz_factor( adata, layers=layers, @@ -806,13 +804,16 @@ def normalize_cell_expr_by_size_factors( scale_to=scale_to, ) + layers = pd.Series(layers_to_sz).str.split("_Size_Factor", expand=True).iloc[:, 0].tolist() + layers[np.where(np.array(layers) == "Size_Factor")[0][0]] = "X" + excluded_layers = [] if not X_total_layers: excluded_layers.extend(["X"]) if not splicing_total_layers: excluded_layers.extend(["spliced", "unspliced"]) - main_info("size factor normalize following layers: " + str(layers)) + main_debug("size factor normalize following layers: " + str(layers)) for layer in layers: if layer in excluded_layers: szfactors, CM = get_sz_exprs(adata, layer, total_szfactor=None) @@ -859,7 +860,7 @@ def normalize_cell_expr_by_size_factors( main_warning(_norm_method + " is not implemented yet") if layer in ["raw", "X"]: - main_info("set adata to normalized data using %s" % _norm_method) + main_debug("set adata to normalized data using %s" % _norm_method) adata.X = CM elif layer == "protein" and "protein" in adata.obsm_keys(): main_info_insert_adata_obsm("X_protein") diff --git a/dynamo/preprocessing/utils.py b/dynamo/preprocessing/utils.py index 5695e0794..a62e95244 100755 --- a/dynamo/preprocessing/utils.py +++ b/dynamo/preprocessing/utils.py @@ -707,12 +707,12 @@ def normalize_mat_monocle( if pseudo_expr is None: pseudo_expr = 1 - if issparse(mat): - mat.data = norm_method(mat.data + pseudo_expr) if norm_method is not None else mat.data - if norm_method is not None and norm_method.__name__ == "Freeman_Tukey": - mat.data -= 1 - else: - mat = norm_method(mat + pseudo_expr) if norm_method is not None else mat + # if issparse(mat): + # mat.data = norm_method(mat.data + pseudo_expr) if norm_method is not None else mat.data + # if norm_method is not None and norm_method.__name__ == "Freeman_Tukey": + # mat.data -= 1 + # else: + # mat = norm_method(mat + pseudo_expr) if norm_method is not None else mat return mat From 8fd56dcc442a71ada1a47c2f2f78eb92e9e4a31d Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Mon, 24 Apr 2023 16:26:52 -0400 Subject: [PATCH 20/28] Revert back to the previous functions and fix issues --- dynamo/dynamo_logger.py | 3 +-- dynamo/preprocessing/Preprocessor.py | 25 ++++------------------ dynamo/preprocessing/gene_selection.py | 14 ++++++------ dynamo/preprocessing/preprocessor_utils.py | 4 ++-- dynamo/preprocessing/utils.py | 12 +++++------ 5 files changed, 20 insertions(+), 38 deletions(-) diff --git a/dynamo/dynamo_logger.py b/dynamo/dynamo_logger.py index d0ebbb1b0..52002c63e 100644 --- a/dynamo/dynamo_logger.py +++ b/dynamo/dynamo_logger.py @@ -189,11 +189,10 @@ def report_progress(self, percent=None, count=None, total=None, progress_name="" def finish_progress(self, progress_name="", time_unit="s", indent_level=1): self.log_time() - #self.report_progress(percent=100, progress_name=progress_name) + # self.report_progress(percent=100, progress_name=progress_name) saved_terminator = self.logger_stream_handler.terminator self.logger_stream_handler.terminator = "" - #self.logger.info("\n") self.logger_stream_handler.flush() self.logger_stream_handler.terminator = saved_terminator diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 7baee3bae..09c686f16 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -386,12 +386,6 @@ def _log1p(self, adata: AnnData) -> None: """ if self.use_log1p: - if is_log1p_transformed_adata(adata): - main_warning( - "Your adata.X maybe log1p transformed before. If you are sure that your adata is not log1p transformed, please ignore this warning. Dynamo will do log1p transformation still." - ) - # TODO: the following line is for monocle recipe and later dynamics matrix recovery - # refactor with dynamics module adata.uns["pp"]["norm_method"] = "log1p" main_debug("applying log1p transformation on expression matrix data (adata.X)...") self.log1p(adata, **self.log1p_kwargs) @@ -484,23 +478,10 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None "shared_count": 30, } self.select_genes = select_genes_monocle - self.select_genes_kwargs = { - "n_top_genes": n_top_genes, - "sort_by": "cv_dispersion", - "keep_filtered": True, - "SVRs_kwargs": { - "relative_expr": True, - "min_expr_cells": 0, - "min_expr_avg": 0, - "max_expr_avg": np.inf, - "winsorize": False, - "winsor_perc": (1, 99.5), - "sort_inverse": False, - "svr_gamma": None, - }, - } + self.select_genes_kwargs = {"n_top_genes": n_top_genes, "SVRs_kwargs": {"relative_expr": False}} self.normalize_selected_genes = None self.normalize_by_cells = normalize_cell_expr_by_size_factors + self.normalize_by_cells_function_kwargs = {"skip_log": True} self.use_log1p = True self.pca = pca self.pca_kwargs = {"pca_key": "X_pca"} @@ -626,6 +607,7 @@ def config_sctransform_recipe(self, adata: AnnData) -> None: } self.select_genes = select_genes_by_seurat_recipe self.select_genes_kwargs = {"inplace": True} + self.normalize_by_cells_function_kwargs = {"skip_log": True} self.sctransform_kwargs = {"layers": raw_layers, "n_top_genes": 2000} self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50} @@ -738,6 +720,7 @@ def config_monocle_pearson_residuals_recipe(self, adata: AnnData) -> None: # self.filter_cells_by_outliers = None # self.filter_genes_by_outliers = None self.normalize_by_cells = normalize_cell_expr_by_size_factors + self.normalize_by_cells_function_kwargs = {"skip_log": True} self.select_genes = select_genes_by_pearson_residuals self.select_genes_kwargs = {"n_top_genes": 2000} self.normalize_selected_genes = normalize_layers_pearson_residuals diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 0d3b9f1f8..7168fd304 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -319,13 +319,13 @@ def get_vaild_CM( if total_szfactor is not None and total_szfactor in adata.obs.keys(): szfactors = adata.obs[total_szfactor].values[:, None] if total_szfactor in adata.obs.columns else None - # if szfactors is not None and relative_expr: - # if issparse(CM): - # from sklearn.utils import sparsefuncs - # - # sparsefuncs.inplace_row_scale(CM, 1 / szfactors) - # else: - # CM /= szfactors + if szfactors is not None and relative_expr: + if issparse(CM): + from sklearn.utils import sparsefuncs + + sparsefuncs.inplace_row_scale(CM, 1 / szfactors) + else: + CM /= szfactors if winsorize: if min_expr_cells <= ((100 - winsor_perc[1]) * CM.shape[0] * 0.01): diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index ab7dac0fc..b5bee5804 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -66,7 +66,7 @@ def is_log1p_transformed_adata(adata: anndata.AnnData) -> bool: chosen_gene_indices = np.random.choice(adata.n_vars, 10) _has_log1p_transformed = not np.allclose( np.array(adata.X[:, chosen_gene_indices].sum(1)), - np.array(adata.layers["X_spliced"][:, chosen_gene_indices].sum(1)), + np.array(adata.layers["spliced"][:, chosen_gene_indices].sum(1)), atol=1e-4, ) return _has_log1p_transformed @@ -829,7 +829,7 @@ def normalize_cell_expr_by_size_factors( _norm_method = norm_method if skip_log: - main_info("skipping log transformation as input requires...") + main_debug("skipping log transformation as input requires...") _norm_method = None if _norm_method in [np.log1p, np.log, np.log2, Freeman_Tukey, None] and layer != "protein": diff --git a/dynamo/preprocessing/utils.py b/dynamo/preprocessing/utils.py index a62e95244..5695e0794 100755 --- a/dynamo/preprocessing/utils.py +++ b/dynamo/preprocessing/utils.py @@ -707,12 +707,12 @@ def normalize_mat_monocle( if pseudo_expr is None: pseudo_expr = 1 - # if issparse(mat): - # mat.data = norm_method(mat.data + pseudo_expr) if norm_method is not None else mat.data - # if norm_method is not None and norm_method.__name__ == "Freeman_Tukey": - # mat.data -= 1 - # else: - # mat = norm_method(mat + pseudo_expr) if norm_method is not None else mat + if issparse(mat): + mat.data = norm_method(mat.data + pseudo_expr) if norm_method is not None else mat.data + if norm_method is not None and norm_method.__name__ == "Freeman_Tukey": + mat.data -= 1 + else: + mat = norm_method(mat + pseudo_expr) if norm_method is not None else mat return mat From 665f3b74dc70b5de743f75c6b12a47a579dfca9d Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Mon, 24 Apr 2023 20:58:03 -0400 Subject: [PATCH 21/28] Refactoring normalize and log1p function --- dynamo/plot/preprocess.py | 2 +- dynamo/preprocessing/Preprocessor.py | 58 ++--- dynamo/preprocessing/__init__.py | 8 +- dynamo/preprocessing/preprocess.py | 2 +- dynamo/preprocessing/preprocessor_utils.py | 273 +++++++++++++++++---- dynamo/preprocessing/utils.py | 13 + dynamo/tools/clustering.py | 7 +- tests/test_preprocess.py | 4 +- 8 files changed, 272 insertions(+), 95 deletions(-) diff --git a/dynamo/plot/preprocess.py b/dynamo/plot/preprocess.py index eb983cf1f..1b31195a3 100755 --- a/dynamo/plot/preprocess.py +++ b/dynamo/plot/preprocess.py @@ -657,7 +657,7 @@ def feature_genes( layer: `str` (default: `X`) The data from a particular layer (include X) used for making the feature gene plot. mode: None or `str` (default: `None`) - The method to select the feature genes (can be either `dispersion`, `gini` or `SVR`). + The method to select the feature genes (can be either `cv_dispersion`, `fano_dispersion` or `gini`). figsize: `string` (default: (4, 3)) Figure size of each facet. save_show_or_return: {'show', 'save', 'return'} (default: `show`) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 09c686f16..b48f69a17 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -32,11 +32,7 @@ from .preprocessor_utils import ( filter_genes_by_outliers as monocle_filter_genes_by_outliers, ) -from .preprocessor_utils import ( - is_log1p_transformed_adata, - log1p_adata, - normalize_cell_expr_by_size_factors, -) +from .preprocessor_utils import is_log1p_transformed_adata, log1p, normalize from .utils import ( collapse_species_adata, convert2symbol, @@ -55,7 +51,7 @@ def __init__( filter_cells_by_outliers_kwargs: dict = {}, filter_genes_by_outliers_function: Callable = monocle_filter_genes_by_outliers, filter_genes_by_outliers_kwargs: dict = {}, - normalize_by_cells_function: Callable = normalize_cell_expr_by_size_factors, + normalize_by_cells_function: Callable = normalize, normalize_by_cells_function_kwargs: dict = {}, size_factor_function: Callable = calc_sz_factor, size_factor_kwargs: dict = {}, @@ -63,8 +59,8 @@ def __init__( select_genes_kwargs: dict = {}, normalize_selected_genes_function: Callable = None, normalize_selected_genes_kwargs: dict = {}, - use_log1p: bool = True, - log1p_kwargs: dict = {}, + norm_method: bool = True, + norm_method_kwargs: dict = {}, pca_function: Callable = pca, pca_kwargs: dict = {}, gene_append_list: List[str] = [], @@ -97,8 +93,8 @@ def __init__( select_genes_kwargs: arguments that will be passed to select_genes. Defaults to {}. normalize_selected_genes_function: function for normalize selected genes. Defaults to None. normalize_selected_genes_kwargs: arguments that will be passed to normalize_selected_genes. Defaults to {}. - use_log1p: whether to use log1p to normalize layers in adata. Defaults to True. - log1p_kwargs: arguments passed to use_log1p. Defaults to {}. + norm_method: whether to use a method to normalize layers in adata. Defaults to True. + norm_method_kwargs: arguments passed to norm_method. Defaults to {}. pca_function: function to perform pca. Defaults to pca in utils.py. pca_kwargs: arguments that will be passed pca. Defaults to {}. gene_append_list: ensure that a list of genes show up in selected genes across all the recipe pipeline. @@ -110,8 +106,8 @@ def __init__( self.convert_layers2csr = convert_layers2csr self.unique_var_obs_adata = unique_var_obs_adata - self.log1p = log1p_adata - self.log1p_kwargs = log1p_kwargs + self.norm_method = log1p + self.norm_method_kwargs = norm_method_kwargs self.sctransform = sctransform self.filter_cells_by_outliers = filter_cells_by_outliers_function @@ -120,7 +116,6 @@ def __init__( self.calc_size_factor = size_factor_function self.select_genes = select_genes_function self.normalize_selected_genes = normalize_selected_genes_function - self.use_log1p = use_log1p self.pca = pca_function self.pca_kwargs = pca_kwargs @@ -377,18 +372,16 @@ def _normalize_by_cells(self, adata: AnnData) -> None: main_debug("applying normalize by cells function...") self.normalize_by_cells(adata, **self.normalize_by_cells_function_kwargs) - def _log1p(self, adata: AnnData) -> None: - """Perform log1p on the data with args specified in the preprocessor's - `log1p_kwargs`. + def _norm_method(self, adata: AnnData) -> None: + """Perform a normalization method on the data with args specified in the preprocessor's `norm_method_kwargs`. Args: adata: an AnnData object. """ - if self.use_log1p: - adata.uns["pp"]["norm_method"] = "log1p" - main_debug("applying log1p transformation on expression matrix data (adata.X)...") - self.log1p(adata, **self.log1p_kwargs) + if callable(self.norm_method): + main_debug("applying a normalization method transformation on expression matrix data...") + self.norm_method(adata, **self.norm_method_kwargs) def _pca(self, adata: AnnData) -> None: """Perform pca reduction with args specified in the preprocessor's `pca_kwargs`. @@ -435,7 +428,7 @@ def preprocess_adata_seurat_wo_pca( self._filter_genes_by_outliers(adata) self._normalize_by_cells(adata) self._select_genes(adata) - self._log1p(adata) + self._norm_method(adata) temp_logger.finish_progress(progress_name="preprocess by seurat wo pca recipe") @@ -480,9 +473,8 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None self.select_genes = select_genes_monocle self.select_genes_kwargs = {"n_top_genes": n_top_genes, "SVRs_kwargs": {"relative_expr": False}} self.normalize_selected_genes = None - self.normalize_by_cells = normalize_cell_expr_by_size_factors - self.normalize_by_cells_function_kwargs = {"skip_log": True} - self.use_log1p = True + self.normalize_by_cells = normalize + self.norm_method = log1p self.pca = pca self.pca_kwargs = {"pca_key": "X_pca"} @@ -525,7 +517,7 @@ def preprocess_adata_monocle( self._exclude_gene_list(adata) self._force_gene_list(adata) - self._log1p(adata) + self._norm_method(adata) self._pca(adata) self._cell_cycle_score(adata) @@ -544,11 +536,8 @@ def config_seurat_recipe(self, adata: AnnData) -> None: "algorithm": "seurat_dispersion", "n_top_genes": 2000, } - self.normalize_by_cells_function_kwargs = {"skip_log": True} self.pca_kwargs = {"pca_key": "X_pca"} self.filter_genes_by_outliers_kwargs = {"shared_count": 20} - self.use_log1p = True - self.log1p_kwargs = {"layers": ["X"]} def preprocess_adata_seurat( self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None @@ -583,7 +572,7 @@ def preprocess_adata_seurat( self._exclude_gene_list(adata) self._force_gene_list(adata) - self._log1p(adata) + self._norm_method(adata) self._pca(adata) temp_logger.finish_progress(progress_name="Preprocessor-seurat") @@ -595,7 +584,6 @@ def config_sctransform_recipe(self, adata: AnnData) -> None: adata: an AnnData object. """ - self.use_log1p = False raw_layers = DKM.get_raw_data_layers(adata) self.filter_cells_by_outliers_kwargs = {"keep_filtered": False} self.filter_genes_by_outliers_kwargs = { @@ -607,7 +595,6 @@ def config_sctransform_recipe(self, adata: AnnData) -> None: } self.select_genes = select_genes_by_seurat_recipe self.select_genes_kwargs = {"inplace": True} - self.normalize_by_cells_function_kwargs = {"skip_log": True} self.sctransform_kwargs = {"layers": raw_layers, "n_top_genes": 2000} self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50} @@ -635,8 +622,6 @@ def preprocess_adata_sctransform( self._filter_cells_by_outliers(adata) self._filter_genes_by_outliers(adata) - self._calc_size_factor(adata) - self._normalize_by_cells(adata) self._select_genes(adata) # TODO: if inplace in select_genes is True, the following subset is unnecessary. adata._inplace_subset_var(adata.var["use_for_pca"]) @@ -669,7 +654,6 @@ def config_pearson_residuals_recipe(self, adata: AnnData) -> None: normalize_layers = DKM.get_raw_data_layers(adata) self.normalize_selected_genes_kwargs = {"layers": normalize_layers, "copy": False} self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50} - self.use_log1p = False def preprocess_adata_pearson_residuals( self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None @@ -719,16 +703,12 @@ def config_monocle_pearson_residuals_recipe(self, adata: AnnData) -> None: self.config_monocle_recipe(adata) # self.filter_cells_by_outliers = None # self.filter_genes_by_outliers = None - self.normalize_by_cells = normalize_cell_expr_by_size_factors - self.normalize_by_cells_function_kwargs = {"skip_log": True} + self.normalize_by_cells = normalize self.select_genes = select_genes_by_pearson_residuals self.select_genes_kwargs = {"n_top_genes": 2000} self.normalize_selected_genes = normalize_layers_pearson_residuals - self.normalize_selected_genes_kwargs = {"layers": ["X"], "copy": False} - self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50} - self.use_log1p = False def preprocess_adata_monocle_pearson_residuals( self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None diff --git a/dynamo/preprocessing/__init__.py b/dynamo/preprocessing/__init__.py index 0c82504f2..ec0e14f93 100755 --- a/dynamo/preprocessing/__init__.py +++ b/dynamo/preprocessing/__init__.py @@ -31,8 +31,8 @@ filter_cells = filter_cells_by_outliers filter_genes = filter_genes_by_outliers -log1p = log1p_adata -normalize_cells = normalize_cell_expr_by_size_factors +log1p = log1p +normalize_cells = normalize from .CnmfPreprocessor import CnmfPreprocessor from .gene_selection import Gini, select_genes_by_svr, select_genes_monocle @@ -45,7 +45,7 @@ "normalize_cells", "lambda_correction", "calc_sz_factor_legacy", - "normalize_cell_expr_by_size_factors", + "normalize", "recipe_monocle", "recipe_velocyto", "Gini", @@ -70,6 +70,6 @@ "Preprocessor", "CnmfPreprocessor", "log1p", - "log1p_adata", + "log1p", "log1p_adata_layer", ] diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index 097a60756..595215371 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -33,7 +33,7 @@ _infer_labeling_experiment_type, filter_cells_by_outliers, filter_genes_by_outliers, - normalize_cell_expr_by_size_factors, + normalize, ) from .utils import ( Freeman_Tukey, diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index b5bee5804..ae9ed3d86 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -21,10 +21,10 @@ main_finish_progress, main_info, main_info_insert_adata, + main_info_insert_adata_layer, main_info_insert_adata_obs, main_info_insert_adata_obsm, main_info_insert_adata_uns, - main_info_insert_adata_var, main_log_time, main_warning, ) @@ -46,8 +46,8 @@ get_svr_filter, get_sz_exprs, merge_adata_attrs, - normalize_mat_monocle, pca, + size_factor_normalize, sz_util, unique_var_obs_adata, ) @@ -269,7 +269,64 @@ def log1p_adata_layer(adata: AnnData, layer: str = DKM.X_LAYER, copy: bool = Fal return _adata -def log1p_adata(adata: AnnData, layers: list = [DKM.X_LAYER], copy: bool = False) -> AnnData: +def log2_adata_layer(adata: AnnData, layer: str = DKM.X_LAYER, copy: bool = False) -> AnnData: + """Calculate log2 of adata's specific layer. + + Args: + adata: an AnnData object. + layer: the layer to operate on. Defaults to DKM.X_LAYER. + copy: whether operate on the original object or on a copied one and return it. Defaults to False. + + Returns: + The updated AnnData object. + """ + + _adata = adata + if copy: + _adata = copy_adata(adata) + log2_inplace(_adata, layer=layer) + return _adata + + +def log_adata_layer(adata: AnnData, layer: str = DKM.X_LAYER, copy: bool = False) -> AnnData: + """Calculate log of adata's specific layer. + + Args: + adata: an AnnData object. + layer: the layer to operate on. Defaults to DKM.X_LAYER. + copy: whether operate on the original object or on a copied one and return it. Defaults to False. + + Returns: + The updated AnnData object. + """ + + _adata = adata + if copy: + _adata = copy_adata(adata) + log_inplace(_adata, layer=layer) + return _adata + + +def Freeman_Tukey_adata_layer(adata: AnnData, layer: str = DKM.X_LAYER, copy: bool = False) -> AnnData: + """Calculate Freeman_Tukey of adata's specific layer. + + Args: + adata: an AnnData object. + layer: the layer to operate on. Defaults to DKM.X_LAYER. + copy: whether operate on the original object or on a copied one and return it. Defaults to False. + + Returns: + The updated AnnData object. + """ + + _adata = adata + if copy: + _adata = copy_adata(adata) + Freeman_Tukey_inplace(_adata, layer=layer) + return _adata + + +def log1p(adata: AnnData, layers: list = [DKM.X_LAYER], copy: bool = False) -> AnnData: """Perform log1p transform on selected adata layers Args: @@ -285,9 +342,87 @@ def log1p_adata(adata: AnnData, layers: list = [DKM.X_LAYER], copy: bool = False if copy: _adata = copy_adata(adata) - main_info("log1p transform applied to layers: %s" % (str(layers))) + main_debug("[log1p] transform applied to layers: %s" % (str(layers))) for layer in layers: log1p_adata_layer(_adata, layer=layer) + + main_info_insert_adata_uns("pp.norm_method") + adata.uns["pp"]["norm_method"] = "log1p" + return _adata + + +def log2(adata: AnnData, layers: list = [DKM.X_LAYER], copy: bool = False) -> AnnData: + """Perform log2 transform on selected adata layers + + Args: + adata: an AnnData object. + layers: the layers to operate on. Defaults to [DKM.X_LAYER]. + copy: whether operate on the original object or on a copied one and return it. Defaults to False. + + Returns: + The updated AnnData object. + """ + + _adata = adata + if copy: + _adata = copy_adata(adata) + + main_debug("[log2] transform applied to layers: %s" % (str(layers))) + for layer in layers: + log2_adata_layer(_adata, layer=layer) + + main_info_insert_adata_uns("pp.norm_method") + adata.uns["pp"]["norm_method"] = "log2" + return _adata + + +def log(adata: AnnData, layers: list = [DKM.X_LAYER], copy: bool = False) -> AnnData: + """Perform log transform on selected adata layers + + Args: + adata: an AnnData object. + layers: the layers to operate on. Defaults to [DKM.X_LAYER]. + copy: whether operate on the original object or on a copied one and return it. Defaults to False. + + Returns: + The updated AnnData object. + """ + + _adata = adata + if copy: + _adata = copy_adata(adata) + + main_debug("[log] transform applied to layers: %s" % (str(layers))) + for layer in layers: + log_adata_layer(_adata, layer=layer) + + main_info_insert_adata_uns("pp.norm_method") + adata.uns["pp"]["norm_method"] = "log" + return _adata + + +def Freeman_Tukey(adata: AnnData, layers: list = [DKM.X_LAYER], copy: bool = False) -> AnnData: + """Perform Freeman_Tukey transform on selected adata layers + + Args: + adata: an AnnData object. + layers: the layers to operate on. Defaults to [DKM.X_LAYER]. + copy: whether operate on the original object or on a copied one and return it. Defaults to False. + + Returns: + The updated AnnData object. + """ + + _adata = adata + if copy: + _adata = copy_adata(adata) + + main_debug("[Freeman_Tukey] transform applied to layers: %s" % (str(layers))) + for layer in layers: + Freeman_Tukey_adata_layer(_adata, layer=layer) + + main_info_insert_adata_uns("pp.norm_method") + adata.uns["pp"]["norm_method"] = "Freeman_Tukey" return _adata @@ -304,6 +439,32 @@ def _log1p_inplace(data: np.ndarray) -> np.ndarray: return np.log1p(data, out=data) +def _log2_inplace(data: np.ndarray) -> np.ndarray: + """Calculate Base-2 logarithm of `x` of an array and update the array inplace. + + Args: + data: the array for calculation. + + Returns: + The updated array. + """ + + return np.log2(data, out=data) + + +def _log_inplace(data: np.ndarray) -> np.ndarray: + """Calculate the natural logarithm `log(exp(x)) = x` of an array and update the array inplace. + + Args: + data: the array for calculation. + + Returns: + The updated array. + """ + + return np.log(data, out=data) + + def log1p_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: """Calculate log1p (log(1+x)) for a layer of an AnnData object inplace. @@ -323,6 +484,65 @@ def log1p_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: _log1p_inplace(mat) +def log2_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: + """Calculate log1p (log(1+x)) for a layer of an AnnData object inplace. + + Args: + adata: an AnnData object. + layer: the layer to operate on. Defaults to DKM.X_LAYER. + """ + + mat = DKM.select_layer_data(adata, layer, copy=False) + if issparse(mat): + if is_integer_arr(mat.data): + mat = mat.asfptype() + DKM.set_layer_data(adata, layer, mat) + _log2_inplace(mat.data + 1) + else: + mat = mat.astype(np.float) + _log2_inplace(mat + 1) + + +def log_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: + """Calculate log1p (log(1+x)) for a layer of an AnnData object inplace. + + Args: + adata: an AnnData object. + layer: the layer to operate on. Defaults to DKM.X_LAYER. + """ + + mat = DKM.select_layer_data(adata, layer, copy=False) + if issparse(mat): + if is_integer_arr(mat.data): + mat = mat.asfptype() + DKM.set_layer_data(adata, layer, mat) + _log_inplace(mat.data + 1) + else: + mat = mat.astype(np.float) + _log_inplace(mat + 1) + + +def Freeman_Tukey_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: + """Calculate Freeman-Tukey transform for a layer of an AnnData object inplace. + + Args: + adata: an AnnData object. + layer: the layer to operate on. Defaults to DKM.X_LAYER. + """ + mat = DKM.select_layer_data(adata, layer, copy=False) + if issparse(mat): + if is_integer_arr(mat.data): + mat = mat.asfptype() + DKM.set_layer_data(adata, layer, mat) + Freeman_Tukey(mat.data) + else: + mat = mat.astype(np.float) + Freeman_Tukey(mat) + + mat.data -= 1 + DKM.set_layer_data(adata, layer, mat) + + def filter_genes_by_outliers( adata: anndata.AnnData, filter_bool: Union[np.ndarray, None] = None, @@ -733,20 +953,16 @@ def calc_sz_factor( # TODO refactor the function below -def normalize_cell_expr_by_size_factors( +def normalize( adata: anndata.AnnData, layers: str = "all", total_szfactor: str = "total_Size_Factor", splicing_total_layers: bool = False, X_total_layers: bool = False, - norm_method: Union[Callable, None] = None, - pseudo_expr: int = 1, - relative_expr: bool = True, keep_filtered: bool = True, recalc_sz: bool = False, sz_method: Literal["mean-geometric-mean-total", "geometric", "median"] = "median", scale_to: Union[float, None] = None, - skip_log: bool = False, ) -> anndata.AnnData: """Normalize the gene expression value for the AnnData object. @@ -761,12 +977,6 @@ def normalize_cell_expr_by_size_factors( splicing_total_layers: whether to also normalize spliced / unspliced layers by size factor from total RNA. Defaults to False. X_total_layers: whether to also normalize adata.X by size factor from total RNA. Defaults to False. - norm_method: the method used to normalize data. Can be either function `np.log1p`, `np.log2` or any other - functions or string `clr`. By default, only .X will be size normalized and log1p transformed while data in - other layers will only be size normalized. Defaults to None. - pseudo_expr: a pseudocount added to the gene expression value before log/log2 normalization. Defaults to 1. - relative_expr: whether we need to divide gene expression values first by size factor before normalization. - Defaults to True. keep_filtered: whether we will only store feature genes in the adata object. If it is False, size factor will be recalculated only for the selected feature genes. Defaults to True. recalc_sz: whether we need to recalculate size factor based on selected genes before normalization. Defaults to @@ -775,7 +985,6 @@ def normalize_cell_expr_by_size_factors( `mean-geometric-mean-total` / `geometric` and `median` are supported. When `median` is used, `locfunc` will be replaced with `np.nanmedian`. Defaults to "median". scale_to: the final total expression for each cell that will be scaled to. Defaults to None. - skip_log: whether skip log transformation. Defaults to False. Returns: An updated anndata object that are updated with normalized expression values for different layers. @@ -791,7 +1000,6 @@ def normalize_cell_expr_by_size_factors( layer_sz_column_names = [i + "_Size_Factor" for i in set(layers).difference("X")] layer_sz_column_names.extend(["Size_Factor"]) - # layers_to_sz = list(set(layer_sz_column_names).difference(adata.obs.keys())) layers_to_sz = list(set(layer_sz_column_names)) if not all(key in adata.obs.keys() for key in layers_to_sz): @@ -820,27 +1028,7 @@ def normalize_cell_expr_by_size_factors( else: szfactors, CM = get_sz_exprs(adata, layer, total_szfactor=total_szfactor) - # log transforms - - # special default norm case for adata.X in monocle logics - if norm_method is None and layer == "X": - _norm_method = np.log1p - else: - _norm_method = norm_method - - if skip_log: - main_debug("skipping log transformation as input requires...") - _norm_method = None - - if _norm_method in [np.log1p, np.log, np.log2, Freeman_Tukey, None] and layer != "protein": - main_debug("applying %s to layer<%s>" % (_norm_method, layer)) - CM = normalize_mat_monocle(CM, szfactors, relative_expr, pseudo_expr, _norm_method) - - elif layer == "protein": # norm_method == 'clr': - if _norm_method != "clr": - main_warning( - "For protein data, log transformation is not recommended. Using clr normalization by default." - ) + if layer == "protein": """This normalization implements the centered log-ratio (CLR) normalization from Seurat which is computed for each gene (M Stoeckius, 2017). """ @@ -857,21 +1045,18 @@ def normalize_cell_expr_by_size_factors( CM = CM.T else: - main_warning(_norm_method + " is not implemented yet") + CM = size_factor_normalize(CM, szfactors) if layer in ["raw", "X"]: - main_debug("set adata to normalized data using %s" % _norm_method) + main_debug("set adata to normalized data.") adata.X = CM elif layer == "protein" and "protein" in adata.obsm_keys(): main_info_insert_adata_obsm("X_protein") adata.obsm["X_protein"] = CM else: - main_info_insert_adata_obsm("X_" + layer) + main_info_insert_adata_layer("X_" + layer) adata.layers["X_" + layer] = CM - main_info_insert_adata_uns("pp.norm_method") - adata.uns["pp"]["norm_method"] = _norm_method.__name__ if callable(_norm_method) else _norm_method - return adata diff --git a/dynamo/preprocessing/utils.py b/dynamo/preprocessing/utils.py index 5695e0794..a72314524 100755 --- a/dynamo/preprocessing/utils.py +++ b/dynamo/preprocessing/utils.py @@ -717,6 +717,19 @@ def normalize_mat_monocle( return mat +def size_factor_normalize(mat: np.ndarray, szfactors: np.ndarray) -> np.ndarray: + """perform size factor normalization on the given array. + + Args: + mat: the array to operate on. + szfactors: the size factors corresponding to the array. + + Returns: + The normalized array divided by size factor + """ + return mat.multiply(csr_matrix(1 / szfactors)) if issparse(mat) else mat / szfactors + + def Freeman_Tukey(X: np.ndarray, inverse=False) -> np.ndarray: """perform Freeman-Tukey transform or inverse transform on the given array. diff --git a/dynamo/tools/clustering.py b/dynamo/tools/clustering.py index 9a2237f69..0a1aa821e 100644 --- a/dynamo/tools/clustering.py +++ b/dynamo/tools/clustering.py @@ -13,8 +13,7 @@ from ..configuration import DKM from ..dynamo_logger import main_info from ..preprocessing.preprocessor_utils import filter_genes_by_outliers as filter_genes -from ..preprocessing.preprocessor_utils import log1p_adata as log1p -from ..preprocessing.preprocessor_utils import normalize_cell_expr_by_size_factors +from ..preprocessing.preprocessor_utils import log1p, normalize from ..preprocessing.utils import pca from ..utils import LoggerManager, copy_adata from .connectivity import _gen_neighbor_keys, neighbors @@ -559,7 +558,7 @@ def cluster_community_from_graph( ) -> Any: """A function takes a graph as input and clusters its nodes into communities using one of three algorithms: Leiden, Louvain, or Infomap. - + Args: graph (nx.Graph): the input graph that would be directly used for clustering. Defaults to None. graph_sparse_matrix: a sparse matrix that would be converted to a graph if `graph` is not supplied. @@ -673,7 +672,7 @@ def scc( filter_genes(adata, min_cell_s=min_cells) adata.uns["pp"] = {} - normalize_cell_expr_by_size_factors(adata, layers="X") + normalize(adata, layers="X") log1p(adata) pca(adata, n_pca_components=30, pca_key="X_pca") diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index 25ff78a58..bc5e38933 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -19,7 +19,7 @@ is_log1p_transformed_adata, is_nonnegative, is_nonnegative_integer_arr, - log1p_adata, + log1p, ) from dynamo.preprocessing.utils import convert_layers2csr @@ -145,7 +145,7 @@ def test_Preprocessor_simple_run(adata): def test_is_log_transformed(): adata = dyn.sample_data.zebrafish() assert not is_log1p_transformed_adata(adata) - log1p_adata(adata) + log1p(adata) assert is_log1p_transformed_adata(adata) From 0cfdfe7869c47ee04c4937aa5de5e32667c26da5 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Tue, 25 Apr 2023 18:17:16 -0400 Subject: [PATCH 22/28] fix for updating value in log functions and naming _Freeman_Tukey --- dynamo/preprocessing/Preprocessor.py | 8 ++++++-- dynamo/preprocessing/__init__.py | 4 ++-- dynamo/preprocessing/gene_selection.py | 4 ++-- dynamo/preprocessing/preprocess.py | 12 ++++++------ dynamo/preprocessing/preprocessor_utils.py | 22 +++++++++++----------- dynamo/preprocessing/utils.py | 2 +- dynamo/tools/markers.py | 6 +++--- dynamo/tools/utils.py | 6 +++--- 8 files changed, 34 insertions(+), 30 deletions(-) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index b48f69a17..e54c6adbb 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -25,14 +25,18 @@ from .cell_cycle import cell_cycle_scores from .gene_selection import select_genes_by_seurat_recipe, select_genes_monocle from .preprocess import pca -from .preprocessor_utils import _infer_labeling_experiment_type, calc_sz_factor +from .preprocessor_utils import ( + Freeman_Tukey, + _infer_labeling_experiment_type, + calc_sz_factor, +) from .preprocessor_utils import ( filter_cells_by_outliers as monocle_filter_cells_by_outliers, ) from .preprocessor_utils import ( filter_genes_by_outliers as monocle_filter_genes_by_outliers, ) -from .preprocessor_utils import is_log1p_transformed_adata, log1p, normalize +from .preprocessor_utils import log, log1p, log2, normalize from .utils import ( collapse_species_adata, convert2symbol, diff --git a/dynamo/preprocessing/__init__.py b/dynamo/preprocessing/__init__.py index ec0e14f93..caef0b092 100755 --- a/dynamo/preprocessing/__init__.py +++ b/dynamo/preprocessing/__init__.py @@ -35,7 +35,7 @@ normalize_cells = normalize from .CnmfPreprocessor import CnmfPreprocessor -from .gene_selection import Gini, select_genes_by_svr, select_genes_monocle +from .gene_selection import Gini, _select_genes_by_svr, select_genes_monocle from .Preprocessor import Preprocessor __all__ = [ @@ -54,7 +54,7 @@ "filter_genes", "filter_genes_by_outliers", "filter_genes_by_clusters_", - "select_genes_by_svr", + "_select_genes_by_svr", "get_svr_filter", "highest_frac_genes", "cell_cycle_scores", diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 7168fd304..509f1d107 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -140,7 +140,7 @@ def select_genes_monocle( filter_bool = filter_bool.index.isin(feature_gene_idx) elif sort_by == "cv_dispersion" or sort_by == "fano_dispersion": if not any("velocyto_SVR" in key for key in adata.uns.keys()): - select_genes_by_svr( + _select_genes_by_svr( adata, layers=layer, filter_bool=filter_bool, @@ -173,7 +173,7 @@ def select_genes_monocle( adata.uns["feature_selection"] = sort_by -def select_genes_by_svr( +def _select_genes_by_svr( adata_ori: AnnData, filter_bool: Union[np.ndarray, None] = None, layers: str = "X", diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index 595215371..6a8c4d752 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -28,7 +28,7 @@ from ..utils import copy_adata from ._deprecated import _top_table from .cell_cycle import cell_cycle_scores -from .gene_selection import select_genes_by_svr +from .gene_selection import _select_genes_by_svr from .preprocessor_utils import ( _infer_labeling_experiment_type, filter_cells_by_outliers, @@ -36,7 +36,7 @@ normalize, ) from .utils import ( - Freeman_Tukey, + _Freeman_Tukey, add_noise_to_duplicates, basic_stats, calc_new_to_total_ratio, @@ -270,7 +270,7 @@ def normalize_cell_expr_by_size_factors_legacy( if norm_method is None and layer == "X": CM = normalize_mat_monocle(CM, szfactors, relative_expr, pseudo_expr, np.log1p) - elif norm_method in [np.log1p, np.log, np.log2, Freeman_Tukey, None] and layer != "protein": + elif norm_method in [np.log1p, np.log, np.log2, _Freeman_Tukey, None] and layer != "protein": CM = normalize_mat_monocle(CM, szfactors, relative_expr, pseudo_expr, norm_method) elif layer == "protein": # norm_method == 'clr': if norm_method != "clr": @@ -753,7 +753,7 @@ def recipe_monocle( # we should create all following data after convert2symbol (gene names) adata.uns["pp"] = {} if norm_method == "Freeman_Tukey": - norm_method = Freeman_Tukey + norm_method = _Freeman_Tukey basic_stats(adata) ( @@ -1258,7 +1258,7 @@ def recipe_velocyto( adata = adata[:, filter_bool] - adata = select_genes_by_svr( + adata = _select_genes_by_svr( adata, layers=["spliced"], min_expr_cells=2, @@ -1501,7 +1501,7 @@ def select_genes_monocle_legacy( "sort_inverse": False, } SVRs_args = update_dict(SVRs_args, SVRs_kwargs) - adata = select_genes_by_svr( + adata = _select_genes_by_svr( adata, layers=[layer], total_szfactor=total_szfactor, diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index ae9ed3d86..be2d192d0 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -31,7 +31,7 @@ from ..tools.utils import update_dict from ..utils import copy_adata from .utils import ( - Freeman_Tukey, + _Freeman_Tukey, add_noise_to_duplicates, basic_stats, calc_new_to_total_ratio, @@ -449,7 +449,7 @@ def _log2_inplace(data: np.ndarray) -> np.ndarray: The updated array. """ - return np.log2(data, out=data) + return np.log2(data + 1, out=data) def _log_inplace(data: np.ndarray) -> np.ndarray: @@ -462,7 +462,7 @@ def _log_inplace(data: np.ndarray) -> np.ndarray: The updated array. """ - return np.log(data, out=data) + return np.log(data + 1, out=data) def log1p_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: @@ -485,7 +485,7 @@ def log1p_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: def log2_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: - """Calculate log1p (log(1+x)) for a layer of an AnnData object inplace. + """Calculate Base-2 logarithm of `x` for a layer of an AnnData object inplace. Args: adata: an AnnData object. @@ -497,14 +497,14 @@ def log2_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: if is_integer_arr(mat.data): mat = mat.asfptype() DKM.set_layer_data(adata, layer, mat) - _log2_inplace(mat.data + 1) + _log2_inplace(mat.data) else: mat = mat.astype(np.float) - _log2_inplace(mat + 1) + _log2_inplace(mat) def log_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: - """Calculate log1p (log(1+x)) for a layer of an AnnData object inplace. + """Calculate the natural logarithm `log(exp(x)) = x` for a layer of an AnnData object inplace. Args: adata: an AnnData object. @@ -516,10 +516,10 @@ def log_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: if is_integer_arr(mat.data): mat = mat.asfptype() DKM.set_layer_data(adata, layer, mat) - _log_inplace(mat.data + 1) + _log_inplace(mat.data) else: mat = mat.astype(np.float) - _log_inplace(mat + 1) + _log_inplace(mat) def Freeman_Tukey_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: @@ -534,10 +534,10 @@ def Freeman_Tukey_inplace(adata: AnnData, layer: str = DKM.X_LAYER) -> None: if is_integer_arr(mat.data): mat = mat.asfptype() DKM.set_layer_data(adata, layer, mat) - Freeman_Tukey(mat.data) + _Freeman_Tukey(mat.data) else: mat = mat.astype(np.float) - Freeman_Tukey(mat) + _Freeman_Tukey(mat) mat.data -= 1 DKM.set_layer_data(adata, layer, mat) diff --git a/dynamo/preprocessing/utils.py b/dynamo/preprocessing/utils.py index a72314524..8ae7bac9b 100755 --- a/dynamo/preprocessing/utils.py +++ b/dynamo/preprocessing/utils.py @@ -730,7 +730,7 @@ def size_factor_normalize(mat: np.ndarray, szfactors: np.ndarray) -> np.ndarray: return mat.multiply(csr_matrix(1 / szfactors)) if issparse(mat) else mat / szfactors -def Freeman_Tukey(X: np.ndarray, inverse=False) -> np.ndarray: +def _Freeman_Tukey(X: np.ndarray, inverse=False) -> np.ndarray: """perform Freeman-Tukey transform or inverse transform on the given array. Args: diff --git a/dynamo/tools/markers.py b/dynamo/tools/markers.py index aae1f87c4..8f959c6f8 100755 --- a/dynamo/tools/markers.py +++ b/dynamo/tools/markers.py @@ -29,7 +29,7 @@ main_tqdm, main_warning, ) -from ..preprocessing.utils import Freeman_Tukey +from ..preprocessing.utils import _Freeman_Tukey from ..tools.connectivity import _gen_neighbor_keys, check_and_recompute_neighbors from .utils import fetch_X_data from .utils_markers import fdr, specificity @@ -674,7 +674,7 @@ def glm_degs( if adata.uns["pp"]["norm_method"] == "log2" else np.exp(X_data.data) - 1 if adata.uns["pp"]["norm_method"] == "log" - else Freeman_Tukey(X_data.data + 1, inverse=True) + else _Freeman_Tukey(X_data.data + 1, inverse=True) if adata.uns["pp"]["norm_method"] == "Freeman_Tukey" else X_data.data ) @@ -684,7 +684,7 @@ def glm_degs( if adata.uns["pp"]["norm_method"] == "log2" else np.exp(X_data) - 1 if adata.uns["pp"]["norm_method"] == "log" - else Freeman_Tukey(X_data, inverse=True) + else _Freeman_Tukey(X_data, inverse=True) if adata.uns["pp"]["norm_method"] == "Freeman_Tukey" else X_data ) diff --git a/dynamo/tools/utils.py b/dynamo/tools/utils.py index c03d5161b..2f609e79b 100755 --- a/dynamo/tools/utils.py +++ b/dynamo/tools/utils.py @@ -35,7 +35,7 @@ main_tqdm, main_warning, ) -from ..preprocessing.utils import Freeman_Tukey +from ..preprocessing.utils import _Freeman_Tukey from ..utils import areinstance, isarray @@ -999,7 +999,7 @@ def inverse_norm(adata: AnnData, layer_x: Union[np.ndarray, sp.csr_matrix]) -> n if adata.uns["pp"]["norm_method"] == "log2" else np.exp(layer_x.data) - 1 if adata.uns["pp"]["norm_method"] == "log" - else Freeman_Tukey(layer_x.data + 1, inverse=True) + else _Freeman_Tukey(layer_x.data + 1, inverse=True) if adata.uns["pp"]["norm_method"] == "Freeman_Tukey" else layer_x.data ) @@ -1011,7 +1011,7 @@ def inverse_norm(adata: AnnData, layer_x: Union[np.ndarray, sp.csr_matrix]) -> n if adata.uns["pp"]["norm_method"] == "log2" else np.exp(layer_x) - 1 if adata.uns["pp"]["norm_method"] == "log" - else Freeman_Tukey(layer_x, inverse=True) + else _Freeman_Tukey(layer_x, inverse=True) if adata.uns["pp"]["norm_method"] == "Freeman_Tukey" else layer_x ) From c36fe0385f52746bbf10e7a732c1cfc58660da81 Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Wed, 26 Apr 2023 15:27:32 -0400 Subject: [PATCH 23/28] fix to save the results of fano and to basic filtering for all recipes --- dynamo/preprocessing/Preprocessor.py | 8 +++++++- dynamo/preprocessing/gene_selection.py | 15 +++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index e54c6adbb..a20bbffa6 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -63,7 +63,7 @@ def __init__( select_genes_kwargs: dict = {}, normalize_selected_genes_function: Callable = None, normalize_selected_genes_kwargs: dict = {}, - norm_method: bool = True, + norm_method: Callable = log1p, norm_method_kwargs: dict = {}, pca_function: Callable = pca, pca_kwargs: dict = {}, @@ -566,7 +566,9 @@ def preprocess_adata_seurat( main_info("Running Seurat recipe preprocessing...") self.standardize_adata(adata, tkey, experiment_type) + self._filter_cells_by_outliers(adata) self._filter_genes_by_outliers(adata) + self._calc_size_factor(adata) self._normalize_by_cells(adata) self._select_genes(adata) @@ -679,6 +681,8 @@ def preprocess_adata_pearson_residuals( temp_logger = LoggerManager.gen_logger("Preprocessor-pearson residual") temp_logger.log_time() self.standardize_adata(adata, tkey, experiment_type) + self._filter_cells_by_outliers(adata) + self._filter_genes_by_outliers(adata) self._select_genes(adata) # append/delete/force selected gene list required by users. @@ -737,6 +741,8 @@ def preprocess_adata_monocle_pearson_residuals( temp_logger = LoggerManager.gen_logger("preprocessor-monocle-pearson-residual") temp_logger.log_time() self.standardize_adata(adata, tkey, experiment_type) + self._filter_cells_by_outliers(adata) + self._filter_genes_by_outliers(adata) self._select_genes(adata) # append/delete/force selected gene list required by users. diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 509f1d107..659b1fbf5 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -230,7 +230,7 @@ def _select_genes_by_svr( if valid_CM is None: continue - mean, cv = get_mean_cv(valid_CM, algorithm, winsorize, winsor_perc) + mean, cv = get_mean_cv(adata, valid_CM, algorithm, winsorize, winsor_perc) fitted_fun, svr_gamma = get_prediction_by_svr(mean, cv, svr_gamma) score = cv - fitted_fun(mean) if sort_inverse: @@ -339,6 +339,7 @@ def get_vaild_CM( def get_mean_cv( + adata: AnnData, valid_CM: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix, scipy.sparse.coo_matrix], algorithm: Literal["cv_dispersion", "fano_dispersion"] = "cv_dispersion", winsorize: bool = False, @@ -347,6 +348,7 @@ def get_mean_cv( """Find the mean and coefficient of variation of gene expression. Args: + adata: an AnnData object algorithm: Method of calculating mean and coefficient of variation, either fano_dispersion or cv_dispersion. valid_CM: Gene expression matrix to be used in a downstream analysis. winsorize: Whether to winsorize the data for the cv vs mean model. Defaults to False. @@ -358,7 +360,7 @@ def get_mean_cv( """ if algorithm == "fano_dispersion": - (gene_counts_stats, gene_fano_parameters) = get_highvar_genes_sparse(valid_CM) + (gene_counts_stats, gene_fano_parameters) = get_highvar_genes_sparse(adata, valid_CM) mean = np.array(gene_counts_stats["mean"]).flatten()[:, None] cv = np.array(gene_counts_stats["fano"]).flatten() return mean, cv @@ -424,6 +426,7 @@ def get_prediction_by_svr(ground: np.ndarray, target: np.ndarray, svr_gamma: Opt # Highly variable gene selection function: def get_highvar_genes_sparse( + adata: AnnData, expression: Union[ np.ndarray, scipy.sparse.csr_matrix, @@ -433,15 +436,18 @@ def get_highvar_genes_sparse( expected_fano_threshold: Optional[float] = None, numgenes: Optional[int] = None, minimal_mean: float = 0.5, + save_key: Optional[str] = None, ) -> Tuple[pd.DataFrame, Dict]: """Find highly-variable genes in sparse single-cell data matrices. Args: + adata: an AnnData object expression: Gene expression matrix expected_fano_threshold: Optionally can be used to set a manual dispersion threshold (for definition of "highly-variable") numgenes: Optionally can be used to find the n most variable genes minimal_mean: Sets a threshold on the minimum mean expression to consider + save_key: the key to store the fano calculation results Returns: gene_counts_stats: Results dataframe containing pertinent information for each gene @@ -501,6 +507,11 @@ def get_highvar_genes_sparse( "T": T, "minimal_mean": minimal_mean, } + + if save_key is not None: + LoggerManager.main_logger.info_insert_adata(save_key, "varm") + gene_counts_stats.set_index(adata.var.index, inplace=True) + adata.varm[save_key] = gene_counts_stats return gene_counts_stats, gene_fano_parameters From 839969f72a22fb29298ac94dc156da8fe169c4ee Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 28 Apr 2023 15:07:22 -0400 Subject: [PATCH 24/28] modularize Gini and dispersion --- dynamo/preprocessing/__init__.py | 6 ++-- dynamo/preprocessing/gene_selection.py | 41 +++++++++++++++++++------- dynamo/preprocessing/preprocess.py | 6 ++-- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/dynamo/preprocessing/__init__.py b/dynamo/preprocessing/__init__.py index caef0b092..4a8209818 100755 --- a/dynamo/preprocessing/__init__.py +++ b/dynamo/preprocessing/__init__.py @@ -35,7 +35,7 @@ normalize_cells = normalize from .CnmfPreprocessor import CnmfPreprocessor -from .gene_selection import Gini, _select_genes_by_svr, select_genes_monocle +from .gene_selection import calc_Gini, calc_dispersion_by_svr, select_genes_monocle from .Preprocessor import Preprocessor __all__ = [ @@ -48,13 +48,13 @@ "normalize", "recipe_monocle", "recipe_velocyto", - "Gini", + "calc_Gini", "filter_cells_by_outliers", "select_genes_monocle", "filter_genes", "filter_genes_by_outliers", "filter_genes_by_clusters_", - "_select_genes_by_svr", + "calc_dispersion_by_svr", "get_svr_filter", "highest_frac_genes", "cell_cycle_scores", diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 659b1fbf5..d815f9700 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -30,7 +30,7 @@ from .utils import compute_gene_exp_fraction, merge_adata_attrs -def Gini(adata: AnnData, layers: Union[Literal["all"], List[str]] = "all") -> AnnData: +def calc_Gini(adata: AnnData, layers: Union[Literal["all"], List[str]] = "all") -> AnnData: """Calculate the Gini coefficient of a numpy array. https://github.com/thomasmaxwellnorman/perturbseq_demo/blob/master/perturbseq/util.py @@ -49,7 +49,7 @@ def Gini(adata: AnnData, layers: Union[Literal["all"], List[str]] = "all") -> An layers = DKM.get_available_layer_keys(adata, layers) - def compute_gini(CM): + def _compute_gini(CM): # convert to dense array if sparse if issparse(CM): CM = CM.A @@ -85,12 +85,36 @@ def compute_gini(CM): else: CM = adata.layers[layer] - var_gini = compute_gini(CM) + var_gini = _compute_gini(CM) adata.var[layer + "_gini"] = var_gini return adata +def get_Gini_filter( + adata: AnnData, + layer: str = DKM.X_LAYER, + n_top_genes: int = 2000, + basic_filter: Union[np.ndarray, pd.Series] = None, +) -> np.ndarray: + """Generate the mask showing the genes with Gini coefficient. + + Args: + adata: an AnnData object. + layer: the layer to operate on. Defaults to "spliced". + n_top_genes: number of top genes to be filtered. Defaults to 3000. + basic_filter: the filter to remove outliers. Should be `adata.var["pass_basic_filter"]` in most cases. + + Returns: + The filter mask as a bool array. + """ + valid_table = adata.var[layer + "_gini"][basic_filter] + feature_gene_idx = np.argsort(-valid_table)[:n_top_genes] + feature_gene_idx = valid_table.index[feature_gene_idx] + filter_bool = basic_filter.index.isin(feature_gene_idx) + return filter_bool + + def select_genes_monocle( adata: AnnData, layer: str = DKM.X_LAYER, @@ -133,14 +157,11 @@ def select_genes_monocle( else: if sort_by == "gini": if layer + "_gini" is not adata.var.keys(): - Gini(adata) - valid_table = adata.var[layer + "_gini"][filter_bool] - feature_gene_idx = np.argsort(-valid_table)[:n_top_genes] - feature_gene_idx = valid_table.index[feature_gene_idx] - filter_bool = filter_bool.index.isin(feature_gene_idx) + calc_Gini(adata) + filter_bool = get_Gini_filter(adata, layer=layer, n_top_genes=n_top_genes, basic_filter=filter_bool) elif sort_by == "cv_dispersion" or sort_by == "fano_dispersion": if not any("velocyto_SVR" in key for key in adata.uns.keys()): - _select_genes_by_svr( + calc_dispersion_by_svr( adata, layers=layer, filter_bool=filter_bool, @@ -173,7 +194,7 @@ def select_genes_monocle( adata.uns["feature_selection"] = sort_by -def _select_genes_by_svr( +def calc_dispersion_by_svr( adata_ori: AnnData, filter_bool: Union[np.ndarray, None] = None, layers: str = "X", diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index 6a8c4d752..729b9c6a4 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -28,7 +28,7 @@ from ..utils import copy_adata from ._deprecated import _top_table from .cell_cycle import cell_cycle_scores -from .gene_selection import _select_genes_by_svr +from .gene_selection import calc_dispersion_by_svr from .preprocessor_utils import ( _infer_labeling_experiment_type, filter_cells_by_outliers, @@ -1258,7 +1258,7 @@ def recipe_velocyto( adata = adata[:, filter_bool] - adata = _select_genes_by_svr( + adata = calc_dispersion_by_svr( adata, layers=["spliced"], min_expr_cells=2, @@ -1501,7 +1501,7 @@ def select_genes_monocle_legacy( "sort_inverse": False, } SVRs_args = update_dict(SVRs_args, SVRs_kwargs) - adata = _select_genes_by_svr( + adata = calc_dispersion_by_svr( adata, layers=[layer], total_szfactor=total_szfactor, From 3558bda14ab5ef7311b59dc41bbef2beb8b1a1c3 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 28 Apr 2023 18:47:21 -0400 Subject: [PATCH 25/28] update sctransform gene selection --- dynamo/external/sctransform.py | 7 +++++++ dynamo/preprocessing/Preprocessor.py | 4 ++-- dynamo/preprocessing/utils.py | 22 ++++++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/dynamo/external/sctransform.py b/dynamo/external/sctransform.py index 6c7278c86..263e454dd 100644 --- a/dynamo/external/sctransform.py +++ b/dynamo/external/sctransform.py @@ -21,6 +21,7 @@ from ..configuration import DKM from ..dynamo_logger import main_info, main_info_insert_adata_layer +from ..preprocessing.utils import get_gene_selection_filter _EPS = np.finfo(float).eps @@ -331,3 +332,9 @@ def sctransform(adata: AnnData, layers: str = [DKM.X_LAYER], output_layer: str = """a wrapper calls sctransform_core and set dynamo style keys in adata""" for layer in layers: sctransform_core(adata, layer=layer, n_genes=n_top_genes, **kwargs) + if adata.X.shape[1] > n_top_genes: + X_squared = adata.X.copy() + X_squared.data **= 2 + variance = X_squared.mean(0) - np.square(adata.X.mean(0)) + adata.var["sct_score"] = variance.A1 + adata.var["use_for_pca"] = get_gene_selection_filter(adata.var["sct_score"], n_top_genes=n_top_genes) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index a20bbffa6..8834c9c6f 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -599,8 +599,6 @@ def config_sctransform_recipe(self, adata: AnnData) -> None: "min_cell_u": 5, "min_count_u": 1, } - self.select_genes = select_genes_by_seurat_recipe - self.select_genes_kwargs = {"inplace": True} self.sctransform_kwargs = {"layers": raw_layers, "n_top_genes": 2000} self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50} @@ -628,6 +626,8 @@ def preprocess_adata_sctransform( self._filter_cells_by_outliers(adata) self._filter_genes_by_outliers(adata) + main_warning("Sctransform recipe will subset the data first with default gene selection function for " + "efficiency. If you want to disable this, please perform sctransform without recipe.") self._select_genes(adata) # TODO: if inplace in select_genes is True, the following subset is unnecessary. adata._inplace_subset_var(adata.var["use_for_pca"]) diff --git a/dynamo/preprocessing/utils.py b/dynamo/preprocessing/utils.py index 8ae7bac9b..51249954e 100755 --- a/dynamo/preprocessing/utils.py +++ b/dynamo/preprocessing/utils.py @@ -528,6 +528,28 @@ def clusters_stats( return U_avgs, S_avgs +def get_gene_selection_filter( + valid_table: pd.Series, + n_top_genes: int = 2000, + basic_filter: Optional[pd.Series] = None, +) -> np.ndarray: + """Generate the mask by sorting given table of scores. + + Args: + valid_table: the scores used to sort the highly variable genes. + n_top_genes: number of top genes to be filtered. Defaults to 2000. + basic_filter: the filter to remove outliers. For example, the `adata.var["pass_basic_filter"]`. + + Returns: + The filter mask as a bool array. + """ + if basic_filter is None: + basic_filter = pd.Series(True, index=valid_table.index) + feature_gene_idx = np.argsort(-valid_table)[:n_top_genes] + feature_gene_idx = valid_table.index[feature_gene_idx] + return basic_filter.index.isin(feature_gene_idx) + + def get_svr_filter( adata: anndata.AnnData, layer: str = "spliced", n_top_genes: int = 3000, return_adata: bool = False ) -> Union[anndata.AnnData, np.ndarray]: From e6e0aaec61d4584b08c815be122127dfd36779d4 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 28 Apr 2023 18:54:37 -0400 Subject: [PATCH 26/28] update helper function usage when getting gini filter --- dynamo/preprocessing/gene_selection.py | 29 +++----------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index d815f9700..fd26e653e 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -27,7 +27,7 @@ get_svr_filter, seurat_get_mean_var, ) -from .utils import compute_gene_exp_fraction, merge_adata_attrs +from .utils import compute_gene_exp_fraction, get_gene_selection_filter, merge_adata_attrs def calc_Gini(adata: AnnData, layers: Union[Literal["all"], List[str]] = "all") -> AnnData: @@ -91,30 +91,6 @@ def _compute_gini(CM): return adata -def get_Gini_filter( - adata: AnnData, - layer: str = DKM.X_LAYER, - n_top_genes: int = 2000, - basic_filter: Union[np.ndarray, pd.Series] = None, -) -> np.ndarray: - """Generate the mask showing the genes with Gini coefficient. - - Args: - adata: an AnnData object. - layer: the layer to operate on. Defaults to "spliced". - n_top_genes: number of top genes to be filtered. Defaults to 3000. - basic_filter: the filter to remove outliers. Should be `adata.var["pass_basic_filter"]` in most cases. - - Returns: - The filter mask as a bool array. - """ - valid_table = adata.var[layer + "_gini"][basic_filter] - feature_gene_idx = np.argsort(-valid_table)[:n_top_genes] - feature_gene_idx = valid_table.index[feature_gene_idx] - filter_bool = basic_filter.index.isin(feature_gene_idx) - return filter_bool - - def select_genes_monocle( adata: AnnData, layer: str = DKM.X_LAYER, @@ -158,7 +134,8 @@ def select_genes_monocle( if sort_by == "gini": if layer + "_gini" is not adata.var.keys(): calc_Gini(adata) - filter_bool = get_Gini_filter(adata, layer=layer, n_top_genes=n_top_genes, basic_filter=filter_bool) + filter_bool = get_gene_selection_filter( + adata.var[layer + "_gini"][filter_bool], n_top_genes=n_top_genes, basic_filter=filter_bool) elif sort_by == "cv_dispersion" or sort_by == "fano_dispersion": if not any("velocyto_SVR" in key for key in adata.uns.keys()): calc_dispersion_by_svr( From c21fcf1addd9cd9f646c419abfddff9b342a3385 Mon Sep 17 00:00:00 2001 From: sichao Date: Mon, 1 May 2023 10:21:58 -0400 Subject: [PATCH 27/28] increase subset size --- dynamo/preprocessing/Preprocessor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 8834c9c6f..52fee6edf 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -599,6 +599,7 @@ def config_sctransform_recipe(self, adata: AnnData) -> None: "min_cell_u": 5, "min_count_u": 1, } + self.select_genes_kwargs = {"n_top_genes": 3000} self.sctransform_kwargs = {"layers": raw_layers, "n_top_genes": 2000} self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50} From b37ef81a8ee94fe61eb4896d380169f671c64f1b Mon Sep 17 00:00:00 2001 From: Ukyeon Date: Thu, 4 May 2023 13:14:46 -0400 Subject: [PATCH 28/28] fix build error in test_normalize --- dynamo/preprocessing/Preprocessor.py | 26 +++++++++------------- dynamo/preprocessing/preprocessor_utils.py | 3 --- tests/test_preprocess.py | 21 ++++++++--------- 3 files changed, 19 insertions(+), 31 deletions(-) diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index d25b98e60..252c02c7b 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -21,14 +21,11 @@ sctransform, select_genes_by_pearson_residuals, ) - -from .cell_cycle import cell_cycle_scores -from .gene_selection import select_genes_by_seurat_recipe, select_genes_monocle from ..tools.connectivity import neighbors as default_neighbors from ..tools.utils import update_dict +from .cell_cycle import cell_cycle_scores +from .gene_selection import select_genes_by_seurat_recipe, select_genes_monocle from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca -from .preprocessor_utils import _infer_labeling_experiment_type - from .preprocessor_utils import ( Freeman_Tukey, _infer_labeling_experiment_type, @@ -36,16 +33,11 @@ ) from .preprocessor_utils import ( filter_cells_by_outliers as monocle_filter_cells_by_outliers, - filter_genes_by_outliers as monocle_filter_genes_by_outliers, ) from .preprocessor_utils import ( - is_log1p_transformed_adata, - log1p_adata, - normalize_cell_expr_by_size_factors, - regress_out_parallel, - select_genes_by_dispersion_general, + filter_genes_by_outliers as monocle_filter_genes_by_outliers, ) -from .preprocessor_utils import log, log1p, log2, normalize +from .preprocessor_utils import log, log1p, log2, normalize, regress_out_parallel from .utils import ( basic_stats, collapse_species_adata, @@ -495,7 +487,7 @@ def config_monocle_recipe(self, adata: AnnData, n_top_genes: int = 2000) -> None self.normalize_selected_genes = None self.normalize_by_cells = normalize self.norm_method = log1p - + self.regress_out_kwargs = update_dict({"obs_keys": []}, self.regress_out_kwargs) self.pca = pca @@ -593,7 +585,7 @@ def preprocess_adata_seurat( self._calc_size_factor(adata) self._normalize_by_cells(adata) self._select_genes(adata) - + # append/delete/force selected gene list required by users. self._append_gene_list(adata) self._exclude_gene_list(adata) @@ -651,8 +643,10 @@ def preprocess_adata_sctransform( self._filter_cells_by_outliers(adata) self._filter_genes_by_outliers(adata) - main_warning("Sctransform recipe will subset the data first with default gene selection function for " - "efficiency. If you want to disable this, please perform sctransform without recipe.") + main_warning( + "Sctransform recipe will subset the data first with default gene selection function for " + "efficiency. If you want to disable this, please perform sctransform without recipe." + ) self._select_genes(adata) # TODO: if inplace in select_genes is True, the following subset is unnecessary. adata._inplace_subset_var(adata.var["use_for_pca"]) diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index c9c41da27..99111be7e 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -1056,9 +1056,6 @@ def normalize( main_info_insert_adata_layer("X_" + layer) adata.layers["X_" + layer] = CM - main_info_insert_adata_uns("pp.norm_method") - adata.uns["pp"]["norm_method"] = _norm_method.__name__ if callable(_norm_method) else _norm_method - return adata diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index 365b7fe58..20d5d17c3 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -3,9 +3,7 @@ import anndata import numpy as np import pandas as pd -import scipy -import scipy.sparse -from scipy.sparse.csr import csr_matrix +from scipy.sparse import csr_matrix from sklearn.decomposition import PCA # from utils import * @@ -19,6 +17,7 @@ is_nonnegative, is_nonnegative_integer_arr, log1p, + normalize, ) from dynamo.preprocessing.utils import convert_layers2csr @@ -160,7 +159,7 @@ def test_layers2csr_matrix(): def test_compute_gene_exp_fraction(): # TODO fix compute_gene_exp_fraction: discuss with Xiaojie # df = pd.DataFrame([[1, 2], [1, 1]]) # input cannot be dataframe - df = scipy.sparse.csr_matrix([[1, 2], [1, 1]]) + df = csr_matrix([[1, 2], [1, 1]]) frac, indices = dyn.preprocessing.compute_gene_exp_fraction(df) print("frac:", list(frac)) assert np.all(np.isclose(frac.flatten(), [2 / 5, 3 / 5])) @@ -282,7 +281,7 @@ def test_gene_selection_method(): print("The preprocess_adata() time difference is :", timeit.default_timer() - starttime) -def test_normalize_cell_expr_by_size_factors(): +def test_normalize(): # Set up test data X = np.array([[1, 2], [3, 4], [5, 6]]) layers = { @@ -303,9 +302,9 @@ def test_normalize_cell_expr_by_size_factors(): adata.uns["pp"] = dict() # Call the function - normalized = normalize_cell_expr_by_size_factors( + normalized = normalize( adata=adata, - norm_method=np.log1p, + # norm_method=np.log1p, ) # Assert that the output is a valid AnnData object @@ -316,10 +315,8 @@ def test_normalize_cell_expr_by_size_factors(): assert normalized.layers["X_spliced"].shape == (3, 2) # Assert that the normalization was applied correctly - assert np.allclose(normalized.X, np.log1p(X / adata.obs["Size_Factor"].values[:, None])) - assert np.allclose( - normalized.layers["X_spliced"].toarray(), np.log1p(X / adata.obs["spliced_Size_Factor"].values[:, None]) - ) + assert np.allclose(normalized.X, (X / adata.obs["Size_Factor"].values[:, None])) + assert np.allclose(normalized.layers["X_spliced"].toarray(), (X / adata.obs["spliced_Size_Factor"].values[:, None])) def test_regress_out(): @@ -361,6 +358,6 @@ def test_regress_out(): # test_highest_frac_genes_plot_prefix_list(adata.copy()) # test_recipe_monocle_feature_selection_layer_simple0() # test_gene_selection_method() - # test_normalize_cell_expr_by_size_factors() + test_normalize() # test_regress_out() pass