From 1b361ed98858a01a64fd95c116248c9b830f25b4 Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 25 Oct 2023 17:25:32 -0400 Subject: [PATCH 01/13] create DKM select_layer_chunked_data --- dynamo/configuration.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/dynamo/configuration.py b/dynamo/configuration.py index 4a48ed33f..a6be5a9a7 100755 --- a/dynamo/configuration.py +++ b/dynamo/configuration.py @@ -38,6 +38,15 @@ class DynamoAdataKeyManager: PROTEIN_LAYER = "protein" X_PCA = "X_pca" + def _select_layer_chunked_data(adata: AnnData, layer: str, start: int, end: int) -> Tuple[np.ndarray, int, int]: + """This utility is a helper function to select chunked layer data.""" + if layer == DynamoAdataKeyManager.X_LAYER: + return (adata.X[start:end], start, end) + elif layer == DynamoAdataKeyManager.PROTEIN_LAYER: + return (adata.obsm["protein"][start:end], start, end) if "protein" in adata.obsm_keys() else None + else: + return (adata.layers[layer][start:end], start, end) + def gen_new_layer_key(layer_name, key, sep="_") -> str: """utility function for returning a new key name for a specific layer. By convention layer_name should not have the separator as the last character.""" if layer_name == "": @@ -83,6 +92,20 @@ def select_layer_data(adata: AnnData, layer: str, copy=False) -> pd.DataFrame: return res_data.copy() return res_data + def select_layer_chunked_data(adata: AnnData, layer: str, chunk_size: int) -> List[Tuple[np.ndarray, int, int]]: + """This utility provides a unified interface for selecting chunked layer data.""" + if layer is None: + layer = DynamoAdataKeyManager.X_LAYER + + start = 0 + n = adata.n_obs + for _ in range(int(n // chunk_size)): + end = start + chunk_size + yield DynamoAdataKeyManager._select_layer_chunked_data(adata=adata, layer=layer, start=start, end=end) + start = end + if start < n: + yield DynamoAdataKeyManager._select_layer_chunked_data(adata=adata, layer=layer, start=start, end=n) + def set_layer_data(adata: AnnData, layer: str, vals: np.array, var_indices: np.array = None): if var_indices is None: var_indices = slice(None) From d33a1f715406d499b0b5ef3ff06397ed95a2f6b8 Mon Sep 17 00:00:00 2001 From: sichao Date: Thu, 26 Oct 2023 11:51:18 -0400 Subject: [PATCH 02/13] implement per gene chunked helper --- dynamo/configuration.py | 83 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 8 deletions(-) diff --git a/dynamo/configuration.py b/dynamo/configuration.py index a6be5a9a7..3052e5a6f 100755 --- a/dynamo/configuration.py +++ b/dynamo/configuration.py @@ -38,14 +38,21 @@ class DynamoAdataKeyManager: PROTEIN_LAYER = "protein" X_PCA = "X_pca" - def _select_layer_chunked_data(adata: AnnData, layer: str, start: int, end: int) -> Tuple[np.ndarray, int, int]: + def _select_layer_chunked_data( + adata: AnnData, + layer: str, + cell_start: int, + cell_end: int, + gene_start: int, + gene_end: int, + ) -> np.ndarray: """This utility is a helper function to select chunked layer data.""" if layer == DynamoAdataKeyManager.X_LAYER: - return (adata.X[start:end], start, end) + return adata.X[cell_start:cell_end, gene_start:gene_end] elif layer == DynamoAdataKeyManager.PROTEIN_LAYER: - return (adata.obsm["protein"][start:end], start, end) if "protein" in adata.obsm_keys() else None + return adata.obsm["protein"][cell_start:cell_end, gene_start:gene_end] if "protein" in adata.obsm_keys() else None else: - return (adata.layers[layer][start:end], start, end) + return adata.layers[layer][cell_start:cell_end, gene_start:gene_end] def gen_new_layer_key(layer_name, key, sep="_") -> str: """utility function for returning a new key name for a specific layer. By convention layer_name should not have the separator as the last character.""" @@ -92,19 +99,79 @@ def select_layer_data(adata: AnnData, layer: str, copy=False) -> pd.DataFrame: return res_data.copy() return res_data - def select_layer_chunked_data(adata: AnnData, layer: str, chunk_size: int) -> List[Tuple[np.ndarray, int, int]]: - """This utility provides a unified interface for selecting chunked layer data.""" + def select_layer_cell_chunked_data(adata: AnnData, layer: str, chunk_size: int) -> List[Tuple[np.ndarray, int, int]]: + """This utility provides a unified interface for selecting cells chunked layer data.""" if layer is None: layer = DynamoAdataKeyManager.X_LAYER + n_genes = adata.n_vars start = 0 n = adata.n_obs for _ in range(int(n // chunk_size)): end = start + chunk_size - yield DynamoAdataKeyManager._select_layer_chunked_data(adata=adata, layer=layer, start=start, end=end) + yield ( + DynamoAdataKeyManager._select_layer_chunked_data( + adata=adata, + layer=layer, + cell_start=start, + cell_end=end, + gene_start=0, + gene_end=n_genes, + ), + start, + end, + ) start = end if start < n: - yield DynamoAdataKeyManager._select_layer_chunked_data(adata=adata, layer=layer, start=start, end=n) + yield ( + DynamoAdataKeyManager._select_layer_chunked_data( + adata=adata, + layer=layer, + cell_start=start, + cell_end=n, + gene_start=0, + gene_end=n_genes, + ), + start, + n, + ) + + def select_layer_gene_chunked_data(adata: AnnData, layer: str, chunk_size: int) -> List[Tuple[np.ndarray, int, int]]: + """This utility provides a unified interface for selecting genes chunked layer data.""" + if layer is None: + layer = DynamoAdataKeyManager.X_LAYER + + n_cells = adata.n_obs + start = 0 + n = adata.n_vars + for _ in range(int(n // chunk_size)): + end = start + chunk_size + yield ( + DynamoAdataKeyManager._select_layer_chunked_data( + adata=adata, + layer=layer, + cell_start=0, + cell_end=n_cells, + gene_start=start, + gene_end=end, + ), + start, + end, + ) + start = end + if start < n: + yield ( + DynamoAdataKeyManager._select_layer_chunked_data( + adata=adata, + layer=layer, + cell_start=0, + cell_end=n_cells, + gene_start=start, + gene_end=n, + ), + start, + n, + ) def set_layer_data(adata: AnnData, layer: str, vals: np.array, var_indices: np.array = None): if var_indices is None: From db3342ee5f35d62d0287e0596f3411a1765c25d7 Mon Sep 17 00:00:00 2001 From: sichao Date: Thu, 26 Oct 2023 15:19:49 -0400 Subject: [PATCH 03/13] implement chunked seurat gene selection --- dynamo/preprocessing/gene_selection.py | 44 +++++++++++++++----------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 6314a5efe..6abb68773 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -527,6 +527,7 @@ def select_genes_by_seurat_recipe( nan_replace_val: Union[float, None] = None, n_top_genes: int = 2000, algorithm: Literal["seurat_dispersion", "fano_dispersion"] = "seurat_dispersion", + chunk_size: Optional[int] = None, seurat_min_disp: Union[float, None] = None, seurat_max_disp: Union[float, None] = None, seurat_min_mean: Union[float, None] = None, @@ -569,25 +570,40 @@ def select_genes_by_seurat_recipe( if len(pass_filter_genes) != len(set(pass_filter_genes)): main_warning("gene names are not unique, please check your preprocessing procedure.") - subset_adata = adata[:, pass_filter_genes] if n_top_genes is None: main_info("n_top_genes is None, reserve all genes and add filter gene information") n_top_genes = adata.n_vars - layer_mat = DKM.select_layer_data(subset_adata, layer) - if nan_replace_val: - main_info("replacing nan values with: %s" % (nan_replace_val)) - _mask = get_nan_or_inf_data_bool_mask(layer_mat) - layer_mat[_mask] = nan_replace_val + + chunk_size = chunk_size if chunk_size is not None else adata.n_vars if algorithm == "seurat_dispersion": + chunked_layer_mats = DKM.select_layer_gene_chunked_data(adata[:, pass_filter_genes], layer, chunk_size=chunk_size) + mean = np.zeros(len(pass_filter_genes)) + variance = np.zeros(len(pass_filter_genes)) + + for mat_data in chunked_layer_mats: + layer_mat = mat_data[0] + + if nan_replace_val: + main_info("replacing nan values with: %s" % (nan_replace_val)) + _mask = get_nan_or_inf_data_bool_mask(layer_mat) + layer_mat[_mask] = nan_replace_val + + chunked_mean, chunked_var = seurat_get_mean_var(layer_mat) + + mean[mat_data[1]:mat_data[2]] = chunked_mean + variance[mat_data[1]:mat_data[2]] = chunked_var + mean, variance, highly_variable_mask = select_genes_by_seurat_dispersion( - layer_mat, + mean=mean, + variance=variance, min_disp=seurat_min_disp, max_disp=seurat_max_disp, min_mean=seurat_min_mean, max_mean=seurat_max_mean, n_top_genes=n_top_genes, ) + main_info_insert_adata_var(DKM.VAR_GENE_MEAN_KEY) main_info_insert_adata_var(DKM.VAR_GENE_VAR_KEY) main_info_insert_adata_var(DKM.VAR_GENE_HIGHLY_VARIABLE_KEY) @@ -603,14 +619,6 @@ def select_genes_by_seurat_recipe( adata.var[DKM.VAR_GENE_HIGHLY_VARIABLE_KEY][pass_filter_genes] = highly_variable_mask adata.var[DKM.VAR_USE_FOR_PCA][pass_filter_genes] = highly_variable_mask - elif algorithm == "fano_dispersion": - select_genes_monocle(adata, layer=layer, sort_by=algorithm) - # adata = select_genes_by_svr( - # adata, - # layers=layer, - # algorithm=algorithm, - # ) - # filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False) else: raise ValueError(f"The algorithm {algorithm} is not existed") @@ -621,7 +629,8 @@ def select_genes_by_seurat_recipe( def select_genes_by_seurat_dispersion( - sparse_layer_mat: csr_matrix, + mean: np.ndarray, + variance: np.ndarray, n_bins: int = 20, log_mean_and_dispersion: bool = True, min_disp: float = None, @@ -661,9 +670,6 @@ def select_genes_by_seurat_dispersion( if max_mean is None: max_mean = 3 - # mean, variance, dispersion = calc_mean_var_dispersion_sparse(sparse_layer_mat) # Dead - sc_mean, sc_var = seurat_get_mean_var(sparse_layer_mat) - mean, variance = sc_mean, sc_var dispersion = variance / mean if log_mean_and_dispersion: From 20c80fb1b82b081c0da877189519b50e6d669ed0 Mon Sep 17 00:00:00 2001 From: sichao Date: Thu, 26 Oct 2023 15:25:11 -0400 Subject: [PATCH 04/13] update docstr for seurat gene selection --- dynamo/preprocessing/gene_selection.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 6abb68773..9f821f0ea 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -546,7 +546,8 @@ def select_genes_by_seurat_recipe( layer: the key of a sparse matrix in adata. Defaults to DKM.X_LAYER. nan_replace_val: your choice of value to replace values in layer. Defaults to None. n_top_genes: number of genes to select as highly variable genes. Defaults to 2000. - algorithm: a method for selecting genes; must be one of "seurat_dispersion" or "fano". + algorithm: a method for selecting genes; Only support "seurat_dispersion" for now. + chunk_size: the size of chunked data. Defaults to None. seurat_min_disp: seurat dispersion min cutoff. Defaults to None. seurat_max_disp: seurat dispersion max cutoff. Defaults to None. seurat_min_mean: seurat mean min cutoff. Defaults to None. @@ -642,7 +643,8 @@ def select_genes_by_seurat_dispersion( """Apply seurat's gene selection recipe by cutoffs. Args: - sparse_layer_mat: the sparse matrix used for gene selection. + mean: mean of the columns for each gene. + variance: variance of the columns for each gene. n_bins: the number of bins for normalization. Defaults to 20. log_mean_and_dispersion: whether log the gene expression values before calculating the dispersion values. Defaults to True. From 736c6e6998559a40cade1a55decb23b870dec620 Mon Sep 17 00:00:00 2001 From: sichao Date: Thu, 26 Oct 2023 16:45:55 -0400 Subject: [PATCH 05/13] refactor DKM chunk helper --- dynamo/configuration.py | 135 ++++++++++--------------- dynamo/preprocessing/gene_selection.py | 7 +- 2 files changed, 57 insertions(+), 85 deletions(-) diff --git a/dynamo/configuration.py b/dynamo/configuration.py index 3052e5a6f..129fe30e3 100755 --- a/dynamo/configuration.py +++ b/dynamo/configuration.py @@ -38,21 +38,33 @@ class DynamoAdataKeyManager: PROTEIN_LAYER = "protein" X_PCA = "X_pca" - def _select_layer_chunked_data( - adata: AnnData, - layer: str, - cell_start: int, - cell_end: int, - gene_start: int, - gene_end: int, + def _select_layer_cell_chunked_data( + mat: np.ndarray, + chunk_size: int, ) -> np.ndarray: - """This utility is a helper function to select chunked layer data.""" - if layer == DynamoAdataKeyManager.X_LAYER: - return adata.X[cell_start:cell_end, gene_start:gene_end] - elif layer == DynamoAdataKeyManager.PROTEIN_LAYER: - return adata.obsm["protein"][cell_start:cell_end, gene_start:gene_end] if "protein" in adata.obsm_keys() else None - else: - return adata.layers[layer][cell_start:cell_end, gene_start:gene_end] + """Select layer data in cell chunks based on chunk_size.""" + start = 0 + n = mat.shape[0] + for _ in range(int(n // chunk_size)): + end = start + chunk_size + yield (mat[start:end, :], start, end) + start = end + if start < n: + yield (mat[start:n, :], start, n) + + def _select_layer_gene_chunked_data( + mat: np.ndarray, + chunk_size: int, + ) -> np.ndarray: + """Select layer data in gene chunks based on chunk_size.""" + start = 0 + n = mat.shape[1] + for _ in range(int(n // chunk_size)): + end = start + chunk_size + yield (mat[:, start:end], start, end) + start = end + if start < n: + yield (mat[:, start:n], start, n) def gen_new_layer_key(layer_name, key, sep="_") -> str: """utility function for returning a new key name for a specific layer. By convention layer_name should not have the separator as the last character.""" @@ -99,79 +111,34 @@ def select_layer_data(adata: AnnData, layer: str, copy=False) -> pd.DataFrame: return res_data.copy() return res_data - def select_layer_cell_chunked_data(adata: AnnData, layer: str, chunk_size: int) -> List[Tuple[np.ndarray, int, int]]: - """This utility provides a unified interface for selecting cells chunked layer data.""" - if layer is None: - layer = DynamoAdataKeyManager.X_LAYER - - n_genes = adata.n_vars - start = 0 - n = adata.n_obs - for _ in range(int(n // chunk_size)): - end = start + chunk_size - yield ( - DynamoAdataKeyManager._select_layer_chunked_data( - adata=adata, - layer=layer, - cell_start=start, - cell_end=end, - gene_start=0, - gene_end=n_genes, - ), - start, - end, - ) - start = end - if start < n: - yield ( - DynamoAdataKeyManager._select_layer_chunked_data( - adata=adata, - layer=layer, - cell_start=start, - cell_end=n, - gene_start=0, - gene_end=n_genes, - ), - start, - n, - ) - - def select_layer_gene_chunked_data(adata: AnnData, layer: str, chunk_size: int) -> List[Tuple[np.ndarray, int, int]]: - """This utility provides a unified interface for selecting genes chunked layer data.""" + def select_layer_chunked_data( + adata: AnnData, + layer: str, + chunk_size: int, + chunk_mode: str = "cell", + ) -> List[Tuple[np.ndarray, int, int]]: + """This utility provides a unified interface for selecting chunked layer data.""" if layer is None: layer = DynamoAdataKeyManager.X_LAYER - n_cells = adata.n_obs - start = 0 - n = adata.n_vars - for _ in range(int(n // chunk_size)): - end = start + chunk_size - yield ( - DynamoAdataKeyManager._select_layer_chunked_data( - adata=adata, - layer=layer, - cell_start=0, - cell_end=n_cells, - gene_start=start, - gene_end=end, - ), - start, - end, - ) - start = end - if start < n: - yield ( - DynamoAdataKeyManager._select_layer_chunked_data( - adata=adata, - layer=layer, - cell_start=0, - cell_end=n_cells, - gene_start=start, - gene_end=n, - ), - start, - n, - ) + if chunk_mode == "cell": + if layer == DynamoAdataKeyManager.X_LAYER: + return DynamoAdataKeyManager._select_layer_cell_chunked_data(adata.X, chunk_size=chunk_size) + elif layer == DynamoAdataKeyManager.PROTEIN_LAYER: + return DynamoAdataKeyManager._select_layer_cell_chunked_data( + adata.obsm["protein"], chunk_size=chunk_size) if "protein" in adata.obsm_keys() else None + else: + return DynamoAdataKeyManager._select_layer_cell_chunked_data(adata.layers[layer], chunk_size=chunk_size) + elif chunk_mode == "gene": + if layer == DynamoAdataKeyManager.X_LAYER: + return DynamoAdataKeyManager._select_layer_gene_chunked_data(adata.X, chunk_size=chunk_size) + elif layer == DynamoAdataKeyManager.PROTEIN_LAYER: + return DynamoAdataKeyManager._select_layer_gene_chunked_data( + adata.obsm["protein"], chunk_size=chunk_size) if "protein" in adata.obsm_keys() else None + else: + return DynamoAdataKeyManager._select_layer_gene_chunked_data(adata.layers[layer], chunk_size=chunk_size) + else: + raise NotImplementedError("chunk_mode %s not implemented." % (chunk_mode)) def set_layer_data(adata: AnnData, layer: str, vals: np.array, var_indices: np.array = None): if var_indices is None: diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 9f821f0ea..44f59138d 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -578,7 +578,12 @@ def select_genes_by_seurat_recipe( chunk_size = chunk_size if chunk_size is not None else adata.n_vars if algorithm == "seurat_dispersion": - chunked_layer_mats = DKM.select_layer_gene_chunked_data(adata[:, pass_filter_genes], layer, chunk_size=chunk_size) + chunked_layer_mats = DKM.select_layer_chunked_data( + adata[:, pass_filter_genes], + layer, + chunk_size=chunk_size, + chunk_mode="gene", + ) mean = np.zeros(len(pass_filter_genes)) variance = np.zeros(len(pass_filter_genes)) From 1343b50a8bd3311609ce4f91dd85f0a9c62c39bc Mon Sep 17 00:00:00 2001 From: sichao Date: Thu, 26 Oct 2023 18:53:21 -0400 Subject: [PATCH 06/13] implement chunked calc_size_factor --- dynamo/preprocessing/normalization.py | 36 +++++++++++++++++++-------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py index 991f599cf..d72245d39 100644 --- a/dynamo/preprocessing/normalization.py +++ b/dynamo/preprocessing/normalization.py @@ -32,6 +32,7 @@ def calc_sz_factor( splicing_total_layers: bool = False, X_total_layers: bool = False, locfunc: Callable = np.nanmean, + chunk_size: Optional[int] = None, round_exprs: bool = False, method: Literal["mean-geometric-mean-total", "geometric", "median"] = "median", scale_to: Union[float, None] = None, @@ -117,6 +118,7 @@ def calc_sz_factor( round_exprs, method, locfunc, + chunk_size=chunk_size, total_layers=None, scale_to=scale_to, ) @@ -127,6 +129,7 @@ def calc_sz_factor( round_exprs, method, locfunc, + chunk_size=chunk_size, total_layers=total_layers, scale_to=scale_to, ) @@ -351,6 +354,7 @@ def sz_util( round_exprs: bool, method: Literal["mean-geometric-mean-total", "geometric", "median"], locfunc: Callable, + chunk_size: Optional[int] = None, total_layers: List[str] = None, CM: pd.DataFrame = None, scale_to: Union[float, None] = None, @@ -389,19 +393,29 @@ def sz_util( extend_layers=False, ) - CM = DKM.select_layer_data(adata, layer) if CM is None else CM - if CM is None: - return None, None + chunk_size = chunk_size if chunk_size is not None else adata.n_obs + chunked_CMs = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) if CM is None else CM - if round_exprs: - main_debug("rounding expression data of layer: %s during size factor calculation" % (layer)) - if issparse(CM): - CM.data = np.round(CM.data, 0) - else: - CM = CM.round().astype("int") + cell_total = np.zeros(adata.n_obs) + + for CM_data in chunked_CMs: + CM = CM_data[0] + + CM = DKM.select_layer_data(adata, layer) if CM is None else CM + if CM is None: + return None, None + + if round_exprs: + main_debug("rounding expression data of layer: %s during size factor calculation" % (layer)) + if issparse(CM): + CM.data = np.round(CM.data, 0) + else: + CM = CM.round().astype("int") + + chunk_cell_total = CM.sum(axis=1).A1 if issparse(CM) else CM.sum(axis=1) + chunk_cell_total += chunk_cell_total == 0 # avoid infinity value after log (0) - cell_total = CM.sum(axis=1).A1 if issparse(CM) else CM.sum(axis=1) - cell_total += cell_total == 0 # avoid infinity value after log (0) + cell_total[CM_data[1]:CM_data[2]] = chunk_cell_total if method in ["mean-geometric-mean-total", "geometric"]: sfs = cell_total / (np.exp(locfunc(np.log(cell_total))) if scale_to is None else scale_to) From ff0f2347ccf42d525c654f3be6bcd0d460ec6db1 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 27 Oct 2023 11:18:12 -0400 Subject: [PATCH 07/13] optimize DKM chunk helper --- dynamo/configuration.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/dynamo/configuration.py b/dynamo/configuration.py index 129fe30e3..969e7df9f 100755 --- a/dynamo/configuration.py +++ b/dynamo/configuration.py @@ -1,5 +1,5 @@ import warnings -from typing import List, Optional, Tuple, Union +from typing import List, Generator, Optional, Tuple, Union import colorcet import matplotlib @@ -37,11 +37,12 @@ class DynamoAdataKeyManager: X_LAYER = "X" PROTEIN_LAYER = "protein" X_PCA = "X_pca" + RAW = "raw" def _select_layer_cell_chunked_data( mat: np.ndarray, chunk_size: int, - ) -> np.ndarray: + ) -> Generator: """Select layer data in cell chunks based on chunk_size.""" start = 0 n = mat.shape[0] @@ -55,7 +56,7 @@ def _select_layer_cell_chunked_data( def _select_layer_gene_chunked_data( mat: np.ndarray, chunk_size: int, - ) -> np.ndarray: + ) -> Generator: """Select layer data in gene chunks based on chunk_size.""" start = 0 n = mat.shape[1] @@ -116,7 +117,7 @@ def select_layer_chunked_data( layer: str, chunk_size: int, chunk_mode: str = "cell", - ) -> List[Tuple[np.ndarray, int, int]]: + ) -> Generator: """This utility provides a unified interface for selecting chunked layer data.""" if layer is None: layer = DynamoAdataKeyManager.X_LAYER @@ -124,6 +125,8 @@ def select_layer_chunked_data( if chunk_mode == "cell": if layer == DynamoAdataKeyManager.X_LAYER: return DynamoAdataKeyManager._select_layer_cell_chunked_data(adata.X, chunk_size=chunk_size) + elif layer == DynamoAdataKeyManager.RAW: + return DynamoAdataKeyManager._select_layer_cell_chunked_data(adata.raw.X, chunk_size=chunk_size) elif layer == DynamoAdataKeyManager.PROTEIN_LAYER: return DynamoAdataKeyManager._select_layer_cell_chunked_data( adata.obsm["protein"], chunk_size=chunk_size) if "protein" in adata.obsm_keys() else None @@ -132,6 +135,8 @@ def select_layer_chunked_data( elif chunk_mode == "gene": if layer == DynamoAdataKeyManager.X_LAYER: return DynamoAdataKeyManager._select_layer_gene_chunked_data(adata.X, chunk_size=chunk_size) + elif layer == DynamoAdataKeyManager.RAW: + return DynamoAdataKeyManager._select_layer_gene_chunked_data(adata.raw.X, chunk_size=chunk_size) elif layer == DynamoAdataKeyManager.PROTEIN_LAYER: return DynamoAdataKeyManager._select_layer_gene_chunked_data( adata.obsm["protein"], chunk_size=chunk_size) if "protein" in adata.obsm_keys() else None From 18672065d6eef678d2748cc1dff8c12b29656de1 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 27 Oct 2023 15:03:26 -0400 Subject: [PATCH 08/13] implement chunk normalization --- dynamo/preprocessing/normalization.py | 83 ++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 13 deletions(-) diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py index d72245d39..bcf432dec 100644 --- a/dynamo/preprocessing/normalization.py +++ b/dynamo/preprocessing/normalization.py @@ -195,6 +195,41 @@ def get_sz_exprs( return szfactors, CM +def get_sz_factors( + adata: anndata.AnnData, layer: str, total_szfactor: Union[str, None] = None +) -> Tuple[np.ndarray, npt.ArrayLike]: + """Get the size factor from an AnnData object. + + Args: + adata: an AnnData object. + layer: the layer for which to get the size factor. + total_szfactor: the key-name for total size factor entry in `adata.obs`. If not None, would override the layer + selected. Defaults to None. + + Returns: + The queried size factor. + """ + + if layer == "raw": + szfactors = adata.obs[layer + "Size_Factor"].values[:, None] + elif layer == "X": + szfactors = adata.obs["Size_Factor"].values[:, None] + elif layer == "protein": + if "protein" in adata.obsm_keys(): + szfactors = adata.obs["protein_Size_Factor"].values[:, None] + else: + szfactors = None + else: + szfactors = adata.obs[layer + "_Size_Factor"].values[:, None] + + if total_szfactor is not None and total_szfactor in adata.obs.keys(): + szfactors = adata.obs[total_szfactor][:, None] + elif total_szfactor is not None: + main_warning("`total_szfactor` is not `None` and it is not in adata object.") + + return szfactors + + def normalize( adata: anndata.AnnData, layers: str = "all", @@ -202,6 +237,7 @@ def normalize( splicing_total_layers: bool = False, X_total_layers: bool = False, keep_filtered: bool = True, + chunk_size: Optional[int] = None, recalc_sz: bool = False, sz_method: Literal["mean-geometric-mean-total", "geometric", "median"] = "median", scale_to: Union[float, None] = None, @@ -240,12 +276,15 @@ def normalize( adata.obs = adata.obs.loc[:, ~adata.obs.columns.str.contains("Size_Factor")] + chunk_size = chunk_size if chunk_size is not None else adata.n_obs + layers = DKM.get_available_layer_keys(adata, layers) calc_sz_factor( adata, layers=layers, locfunc=np.nanmean, + chunk_size=chunk_size, round_exprs=False, method=sz_method, scale_to=scale_to, @@ -259,14 +298,17 @@ def normalize( main_debug("size factor normalize following layers: " + str(layers)) for layer in layers: if layer in excluded_layers: - szfactors, CM = get_sz_exprs(adata, layer, total_szfactor=None) + szfactors = get_sz_factors(adata, layer, total_szfactor=None) else: - szfactors, CM = get_sz_exprs(adata, layer, total_szfactor=total_szfactor) + szfactors = get_sz_factors(adata, layer, total_szfactor=total_szfactor) if layer == "protein": """This normalization implements the centered log-ratio (CLR) normalization from Seurat which is computed for each gene (M Stoeckius, 2017). """ + CMs_data = DKM.select_layer_chunked_data(adata, layer, chunk_size=adata.n_obs) + CM = next(CMs_data)[0] + CM = CM.T n_feature = CM.shape[1] @@ -279,18 +321,33 @@ def normalize( CM[i] = res CM = CM.T + + if "protein" in adata.obsm_keys(): + main_info_insert_adata_obsm("X_protein") + adata.obsm["X_protein"] = CM + else: + main_info_insert_adata_layer("X_" + layer) + adata.layers["X_" + layer] = CM else: - CM = size_factor_normalize(CM, szfactors) - - if layer in ["raw", "X"]: - main_debug("set adata to normalized data.") - adata.X = CM - elif layer == "protein" and "protein" in adata.obsm_keys(): - main_info_insert_adata_obsm("X_protein") - adata.obsm["X_protein"] = CM - else: - main_info_insert_adata_layer("X_" + layer) - adata.layers["X_" + layer] = CM + CMs_data = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) + + if layer in ["raw", "X"]: + main_debug("set adata to normalized data.") + for CM_data in CMs_data: + CM = CM_data[0] + CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) + adata.X[CM_data[1]:CM_data[2]] = CM + else: + main_info_insert_adata_layer("X_" + layer) + + if issparse(adata.layers[layer]): + adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape)) + else: + adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape) + for CM_data in CMs_data: + CM = CM_data[0] + CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) + adata.layers["X_" + layer][CM_data[1]:CM_data[2]] = CM return adata From b86242b09bf435b0903ae1543b9b0d75800c2761 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 27 Oct 2023 15:16:09 -0400 Subject: [PATCH 09/13] update docstr for normalization --- dynamo/preprocessing/normalization.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py index bcf432dec..966706208 100644 --- a/dynamo/preprocessing/normalization.py +++ b/dynamo/preprocessing/normalization.py @@ -54,6 +54,7 @@ def calc_sz_factor( Defaults to False. X_total_layers: whether to also normalize adata.X by size factor from total RNA. Defaults to False. locfunc: the function to normalize the data. Defaults to np.nanmean. + chunk_size: the number of cells to be processed at a time. Defaults to None. round_exprs: whether the gene expression should be rounded into integers. Defaults to False. method: the method used to calculate the expected total reads / UMI used in size factor calculation. Only `mean-geometric-mean-total` / `geometric` and `median` are supported. When `mean-geometric-mean-total` is @@ -257,6 +258,7 @@ def normalize( X_total_layers: whether to also normalize adata.X by size factor from total RNA. Defaults to False. keep_filtered: whether we will only store feature genes in the adata object. If it is False, size factor will be recalculated only for the selected feature genes. Defaults to True. + chunk_size: the number of cells to be processed at a time. Defaults to None. recalc_sz: whether we need to recalculate size factor based on selected genes before normalization. Defaults to False. sz_method: the method used to calculate the expected total reads / UMI used in size factor calculation. Only @@ -428,6 +430,7 @@ def sz_util( used, `locfunc` will be replaced with `np.nanmedian`. When `mean` is used, `locfunc` will be replaced with `np.nanmean`. Defaults to "median". locfunc: the function to normalize the data. + chunk_size: the number of cells to be processed at a time. Defaults to None. total_layers: the layer(s) that can be summed up to get the total mRNA. For example, ["spliced", "unspliced"], ["uu", "ul", "su", "sl"] or ["new", "old"], etc. Defaults to None. CM: the data to operate on, overriding the layer. Defaults to None. From 121cb9a1ec30ac4bf6517f6dab1c5a92c233be38 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 27 Oct 2023 16:49:39 -0400 Subject: [PATCH 10/13] remove unnecessary operations --- dynamo/preprocessing/normalization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py index 966706208..5b2aad4b0 100644 --- a/dynamo/preprocessing/normalization.py +++ b/dynamo/preprocessing/normalization.py @@ -335,10 +335,12 @@ def normalize( if layer in ["raw", "X"]: main_debug("set adata to normalized data.") + for CM_data in CMs_data: CM = CM_data[0] CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) adata.X[CM_data[1]:CM_data[2]] = CM + else: main_info_insert_adata_layer("X_" + layer) @@ -346,6 +348,7 @@ def normalize( adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape)) else: adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape) + for CM_data in CMs_data: CM = CM_data[0] CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) @@ -443,8 +446,6 @@ def sz_util( A tuple (sfs, cell_total) where sfs is the size factors and cell_total is the initial cell size. """ - adata = adata.copy() - if layer == "_total_" and "_total_" not in adata.layers.keys(): if total_layers is not None: total_layers, _ = DKM.aggregate_layers_into_total( @@ -461,7 +462,6 @@ def sz_util( for CM_data in chunked_CMs: CM = CM_data[0] - CM = DKM.select_layer_data(adata, layer) if CM is None else CM if CM is None: return None, None From 5d824101aada7252ce5cc7158002a2db48801ccd Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 27 Oct 2023 17:18:34 -0400 Subject: [PATCH 11/13] adjust precision --- dynamo/preprocessing/gene_selection.py | 4 ++-- dynamo/preprocessing/normalization.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 44f59138d..40bcc5c9e 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -584,8 +584,8 @@ def select_genes_by_seurat_recipe( chunk_size=chunk_size, chunk_mode="gene", ) - mean = np.zeros(len(pass_filter_genes)) - variance = np.zeros(len(pass_filter_genes)) + mean = np.zeros(len(pass_filter_genes), dtype=adata.X.dtype) + variance = np.zeros(len(pass_filter_genes), dtype=adata.X.dtype) for mat_data in chunked_layer_mats: layer_mat = mat_data[0] diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py index 5b2aad4b0..a6343ef2f 100644 --- a/dynamo/preprocessing/normalization.py +++ b/dynamo/preprocessing/normalization.py @@ -345,9 +345,9 @@ def normalize( main_info_insert_adata_layer("X_" + layer) if issparse(adata.layers[layer]): - adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape)) + adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape, dtype=adata.X.dtype)) else: - adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape) + adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape, dtype=adata.X.dtype) for CM_data in CMs_data: CM = CM_data[0] @@ -457,7 +457,7 @@ def sz_util( chunk_size = chunk_size if chunk_size is not None else adata.n_obs chunked_CMs = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) if CM is None else CM - cell_total = np.zeros(adata.n_obs) + cell_total = np.zeros(adata.n_obs, dtype=adata.X.dtype) for CM_data in chunked_CMs: CM = CM_data[0] @@ -477,6 +477,8 @@ def sz_util( cell_total[CM_data[1]:CM_data[2]] = chunk_cell_total + cell_total = cell_total.astype(int) if np.all(cell_total % 1 == 0) else cell_total + if method in ["mean-geometric-mean-total", "geometric"]: sfs = cell_total / (np.exp(locfunc(np.log(cell_total))) if scale_to is None else scale_to) elif method == "median": From eb2553299ef70bf172a46122e2dd00beda25730b Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 27 Oct 2023 18:36:06 -0400 Subject: [PATCH 12/13] change precision back to f64 --- dynamo/preprocessing/normalization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py index a6343ef2f..4fc7a921a 100644 --- a/dynamo/preprocessing/normalization.py +++ b/dynamo/preprocessing/normalization.py @@ -345,9 +345,9 @@ def normalize( main_info_insert_adata_layer("X_" + layer) if issparse(adata.layers[layer]): - adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape, dtype=adata.X.dtype)) + adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape)) else: - adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape, dtype=adata.X.dtype) + adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape) for CM_data in CMs_data: CM = CM_data[0] From d2fed884eac239dcdff19e4fc65ec99dd482b4c8 Mon Sep 17 00:00:00 2001 From: sichao Date: Tue, 31 Oct 2023 16:30:08 -0400 Subject: [PATCH 13/13] add data type transformation for integer X input --- dynamo/preprocessing/normalization.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py index 09320da56..b51619ba3 100644 --- a/dynamo/preprocessing/normalization.py +++ b/dynamo/preprocessing/normalization.py @@ -245,6 +245,7 @@ def normalize( recalc_sz: bool = False, sz_method: Literal["mean-geometric-mean-total", "geometric", "median"] = "median", scale_to: Union[float, None] = None, + transform_int_to_float: bool = True, ) -> anndata.AnnData: """Normalize the gene expression value for the AnnData object. @@ -270,6 +271,8 @@ def normalize( used, `locfunc` will be replaced with `np.nanmedian`. When `mean` is used, `locfunc` will be replaced with `np.nanmean`. Defaults to "median". scale_to: the final total expression for each cell that will be scaled to. Defaults to None. + transform_int_to_float: whether to transform the adata.X from int to float32 for normalization. Defaults to + True. Returns: An updated anndata object that are updated with normalized expression values for different layers. @@ -301,6 +304,11 @@ def normalize( splicing_total_layers=splicing_total_layers, ) + if "X" in layers and transform_int_to_float and adata.X.dtype == "int": + main_warning("Transforming adata.X from int to float32 for normalization. If you want to disable this, set " + "`transform_int_to_float` to False.") + adata.X = adata.X.astype("float32") + main_debug("size factor normalize following layers: " + str(layers)) for layer in layers: if layer in excluded_layers: