Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix additional issues regarding regress out #483

Merged
merged 6 commits into from
May 3, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 28 additions & 9 deletions dynamo/preprocessing/Preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
select_genes_by_pearson_residuals,
)
from ..tools.connectivity import neighbors as default_neighbors
from ..tools.utils import update_dict
from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca
from .preprocessor_utils import _infer_labeling_experiment_type
from .preprocessor_utils import (
Expand Down Expand Up @@ -485,7 +486,9 @@ def config_monocle_recipe(

# recipe monocle log1p all raw data in normalize_by_cells (dynamo version), so we do not need extra log1p transform.
self.use_log1p = False
self.regress_out_kwargs = {"obs_key": [], "gene_selection_key": "use_for_pca"}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)
self.pca = pca
self.pca_kwargs = {"pca_key": "X_pca"}

Expand Down Expand Up @@ -525,6 +528,7 @@ def preprocess_adata_monocle(

if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})

self._pca(adata)

Expand All @@ -545,7 +549,9 @@ def config_seurat_recipe(self, adata: AnnData, regress_out_obs_keys: List[str] =
self.filter_genes_by_outliers_kwargs = {"shared_count": 20}
self.use_log1p = True
self.log1p_kwargs = {"layers": ["X"]}
self.regress_out_kwargs = {"obs_key": [], "gene_selection_key": "use_for_pca"}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)

def preprocess_adata_seurat(
self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None
Expand Down Expand Up @@ -573,6 +579,7 @@ def preprocess_adata_seurat(
self._log1p(adata)
if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})
self._pca(adata)
temp_logger.finish_progress(progress_name="preprocess by seurat recipe")

Expand All @@ -595,6 +602,9 @@ def config_sctransform_recipe(self, adata: AnnData) -> None:
}
self.select_genes_kwargs = {"inplace": True}
self.sctransform_kwargs = {"layers": raw_layers, "n_top_genes": 2000}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)
self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50}

def preprocess_adata_sctransform(
Expand Down Expand Up @@ -624,6 +634,9 @@ def preprocess_adata_sctransform(
# TODO: if inplace in select_genes is True, the following subset is unnecessary.
adata._inplace_subset_var(adata.var["use_for_pca"])
self.sctransform(adata, **self.sctransform_kwargs)
if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})
self._pca(adata)

temp_logger.finish_progress(progress_name="preprocess by sctransform recipe")
Expand All @@ -644,6 +657,9 @@ def config_pearson_residuals_recipe(self, adata: AnnData) -> None:
# select layers in adata to be normalized
normalize_layers = DKM.get_raw_data_layers(adata)
self.normalize_selected_genes_kwargs = {"layers": normalize_layers, "copy": False}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)
self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50}
self.use_log1p = False

Expand All @@ -669,6 +685,10 @@ def preprocess_adata_pearson_residuals(

self._select_genes(adata)
self._normalize_selected_genes(adata)
if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})

self._pca(adata)

temp_logger.finish_progress(progress_name="preprocess by pearson residual recipe")
Expand All @@ -692,6 +712,9 @@ def config_monocle_pearson_residuals_recipe(self, adata: AnnData) -> None:
self.normalize_selected_genes = normalize_layers_pearson_residuals

self.normalize_selected_genes_kwargs = {"layers": ["X"], "copy": False}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)

self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50}
self.use_log1p = False
Expand Down Expand Up @@ -720,13 +743,9 @@ def preprocess_adata_monocle_pearson_residuals(
self._normalize_by_cells(adata)
adata.X = X_copy
self._normalize_selected_genes(adata)
# use monocle to preprocess adata
# self.config_monocle_recipe(adata_copy)
# self.pca = None # do not do pca in this monocle
# self.preprocess_adata_monocle(adata_copy)
# for layer in adata_copy.layers:
# if DKM.is_layer_X_key(layer):
# adata.layers[layer] = adata.
if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})

self.pca(adata, **self.pca_kwargs)
temp_logger.finish_progress(progress_name="preprocess by monocle pearson residual recipe")
Expand Down
29 changes: 14 additions & 15 deletions dynamo/preprocessing/preprocessor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1540,18 +1540,19 @@ def regress_out_parallel(
layer: str = DKM.X_LAYER,
obs_key: Optional[List[str]] = None,
gene_selection_key: Optional[str] = None,
n_cores: Optional[int] = None,
obsm_store_key: str = "X_regress_out",
):
"""Perform linear regression to remove the effects of given variables from a target variable.

Args:
adata: an AnnData object. Feature matrix of shape (n_samples, n_features)
adata: an AnnData object. Feature matrix of shape (n_samples, n_features).
layer: the layer to regress out. Defaults to "X".
obs_key: List of observation keys to be removed
obs_key: List of observation keys to be removed.
gene_selection_key: the key in adata.var that contains boolean values indicating the genes' filtering results.
For example, if "use_for_pca" is selected, regress out is applied only to the genes that are True
for "use_for_pca". Providing this input decreases the processing time of regressing out data.
n_cores: Change this to the number of cores on your system for parallel computing.
n_cores: the number of cores to use for parallel computing. Defaults to None.
obsm_store_key: the key to store the regress out result. Defaults to "X_regress_out".
"""

Expand All @@ -1571,13 +1572,9 @@ def regress_out_parallel(
subset_adata = adata[:, adata.var.loc[:, gene_selection_key]]
x_prev = DKM.select_layer_data(subset_adata, layer)

x_prev = x_prev.toarray().copy()
import timeit

stime = timeit.default_timer()

x_prev = x_prev.toarray()
if x_prev.shape[1] < 10000:
y_new = x_prev # for the cases when variables are more than 1.
y_new = x_prev.copy() # for the cases when variables are more than 1.

for key in obs_key:
if key not in adata.obs.keys():
Expand All @@ -1596,11 +1593,14 @@ def regress_out_parallel(
# Predict the residuals after removing the effects of the variables and save to "X_regress_out"
res = regress_out_chunk(x_prev, y_new)
else:
if n_cores is None:
import os

n_cores = os.cpu_count() / 2 # Use half of available cores as the default.

import multiprocessing
import os

ctx = multiprocessing.get_context("fork")
n_cores = os.cpu_count()

regress_val = np.zeros((x_prev.shape[0], x_prev.shape[1]))
# Split the input data into chunks for parallel processing
Expand Down Expand Up @@ -1635,7 +1635,6 @@ def regress_out_parallel(

res = np.hstack(results)

main_info("regress out: keys(%s), time(%f)" % (obs_key, timeit.default_timer() - stime))
adata.obsm[obsm_store_key] = res


Expand All @@ -1645,11 +1644,11 @@ def regress_out_chunk(
"""Perform a linear regression to remove the effects of cell features (percentage of mitochondria, etc.)

Args:
obs_feature: the training dataset to be regressed out from the target.
gene_expr : List of observation keys used to regress out their effect to gene expression
obs_feature: list of observation keys used to regress out their effect to gene expression.
gene_expr : the current gene expression values of the target variables.

Returns:
numpy array: Predict the effects of the variables to remove
numpy array: the residuals that are predicted the effects of the variables.
"""
from sklearn.linear_model import LinearRegression

Expand Down
23 changes: 10 additions & 13 deletions dynamo/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
import scipy.sparse
import statsmodels.api as sm
from anndata import AnnData
from scipy.sparse.linalg import LinearOperator, svds
from scipy.sparse import csc_matrix, csr_matrix, issparse
from scipy.sparse.linalg import LinearOperator, svds
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.utils import check_random_state
from sklearn.utils.extmath import svd_flip
Expand Down Expand Up @@ -778,6 +778,7 @@ def decode(adata: anndata.AnnData) -> None:
# ---------------------------------------------------------------------------------------------------
# pca


def _truncatedSVD_with_center(
X: Union[csc_matrix, csr_matrix],
n_components: int = 30,
Expand Down Expand Up @@ -844,7 +845,7 @@ def rmatmat(x):
)

# Solve SVD without calculating individuals entries in LinearOperator.
U, Sigma, VT = svds(X_centered, solver='arpack', k=n_components, v0=v0)
U, Sigma, VT = svds(X_centered, solver="arpack", k=n_components, v0=v0)
Sigma = Sigma[::-1]
U, VT = svd_flip(U[:, ::-1], VT[::-1])
X_transformed = U * Sigma
Expand All @@ -865,11 +866,11 @@ def rmatmat(x):
)
X_pca = result_dict["X_pca"]
fit.components_ = result_dict["components_"]
fit.explained_variance_ratio_ = result_dict[
"explained_variance_ratio_"]
fit.explained_variance_ratio_ = result_dict["explained_variance_ratio_"]

return fit, X_pca


def _pca_fit(
X: np.ndarray,
pca_func: Callable,
Expand All @@ -893,7 +894,7 @@ class from sklearn.decomposition.
A tuple containing two elements:
- The fitted PCA object, which has a 'fit' and 'transform' method.
- The transformed array X_pca of shape (n_samples, n_components).
"""
"""
fit = pca_func(
n_components=min(n_components, X.shape[1] - 1),
**kwargs,
Expand Down Expand Up @@ -1007,6 +1008,7 @@ def pca(

if use_incremental_PCA:
from sklearn.decomposition import IncrementalPCA

fit, X_pca = _pca_fit(
X_data,
pca_func=IncrementalPCA,
Expand Down Expand Up @@ -1034,24 +1036,19 @@ def pca(
# data. It only performs SVD decomposition, which is the second part
# in our _truncatedSVD_with_center function.
fit, X_pca = _pca_fit(
X_data,
pca_func=TruncatedSVD,
n_components=n_pca_components + 1,
random_state=random_state
X_data, pca_func=TruncatedSVD, n_components=n_pca_components + 1, random_state=random_state
)
# the first column is related to the total UMI (or library size)
X_pca = X_pca[:, 1:]

adata.obsm[pca_key] = X_pca
if use_incremental_PCA or adata.n_obs < use_truncated_SVD_threshold:
adata.uns[pcs_key] = fit.components_.T
adata.uns[
"explained_variance_ratio_"] = fit.explained_variance_ratio_
adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_
else:
# the first column is related to the total UMI (or library size)
adata.uns[pcs_key] = fit.components_.T[:, 1:]
adata.uns[
"explained_variance_ratio_"] = fit.explained_variance_ratio_[1:]
adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_[1:]
adata.uns["pca_mean"] = fit.mean_ if hasattr(fit, "mean_") else None

if return_all:
Expand Down
18 changes: 11 additions & 7 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,14 @@ def test_pca():
preprocessor = Preprocessor()
preprocessor.preprocess_adata_seurat_wo_pca(adata)
adata = dyn.pp.pca(adata, n_pca_components=30)
assert adata.obsm["X"].shape[1] == 30
assert adata.obsm["X_pca"].shape[1] == 30
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you will need to update the pca_key in the pca function so that the default key is X_pca https://github.com/aristoteleo/dynamo-release/blob/master/dynamo/preprocessing/utils.py

def pca

assert adata.uns["PCs"].shape[1] == 30
assert adata.uns["explained_variance_ratio_"].shape[0] == 30

X_filterd = adata.X[:, adata.var.use_for_pca.values].copy()
pca = PCA(n_components=30, random_state=0)
X_pca_sklearn = pca.fit_transform(X_filterd.toarray())
assert np.linalg.norm(X_pca_sklearn[:, :10] - adata.obsm["X"][:, :10]) < 1e-1
assert np.linalg.norm(X_pca_sklearn[:, :10] - adata.obsm["X_pca"][:, :10]) < 1e-1
assert np.linalg.norm(pca.components_.T[:, :10] - adata.uns["PCs"][:, :10]) < 1e-1
assert np.linalg.norm(pca.explained_variance_ratio_[:10] - adata.uns["explained_variance_ratio_"][:10]) < 1e-1

Expand Down Expand Up @@ -229,15 +229,19 @@ def test_is_nonnegative():


def test_regress_out():
adata = dyn.sample_data.hematopoiesis_raw()
starttime = timeit.default_timer()
celltype_key = "Cell_type"
figsize = (10, 10)
adata = dyn.read("./data/zebrafish.h5ad") # dyn.sample_data.hematopoiesis_raw()
dyn.pl.basic_stats(adata)
dyn.pl.highest_frac_genes(adata)

preprocessor = Preprocessor()
preprocessor = Preprocessor(regress_out_kwargs={"obs_key": ["nCounts", "pMito"]})

starttime = timeit.default_timer()
# preprocessor.preprocess_adata(adata, recipe="monocle", regress_out=["nCounts", "Dummy", "Test", "pMito"])
preprocessor.preprocess_adata(adata, recipe="seurat", regress_out=["nCounts", "pMito"])
preprocessor.preprocess_adata(adata, recipe="monocle")
dyn.tl.reduceDimension(adata, basis="pca")

dyn.pl.umap(adata, color=celltype_key, figsize=figsize)
print("The preprocess_adata() time difference is :", timeit.default_timer() - starttime)


Expand Down