Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix additional issues regarding regress out #483

Merged
merged 6 commits into from
May 3, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 28 additions & 9 deletions dynamo/preprocessing/Preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
select_genes_by_pearson_residuals,
)
from ..tools.connectivity import neighbors as default_neighbors
from ..tools.utils import update_dict
from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca
from .preprocessor_utils import _infer_labeling_experiment_type
from .preprocessor_utils import (
Expand Down Expand Up @@ -485,7 +486,9 @@ def config_monocle_recipe(

# recipe monocle log1p all raw data in normalize_by_cells (dynamo version), so we do not need extra log1p transform.
self.use_log1p = False
self.regress_out_kwargs = {"obs_key": [], "gene_selection_key": "use_for_pca"}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)
self.pca = pca
self.pca_kwargs = {"pca_key": "X_pca"}

Expand Down Expand Up @@ -525,6 +528,7 @@ def preprocess_adata_monocle(

if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})

self._pca(adata)

Expand All @@ -545,7 +549,9 @@ def config_seurat_recipe(self, adata: AnnData, regress_out_obs_keys: List[str] =
self.filter_genes_by_outliers_kwargs = {"shared_count": 20}
self.use_log1p = True
self.log1p_kwargs = {"layers": ["X"]}
self.regress_out_kwargs = {"obs_key": [], "gene_selection_key": "use_for_pca"}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)

def preprocess_adata_seurat(
self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None
Expand Down Expand Up @@ -573,6 +579,7 @@ def preprocess_adata_seurat(
self._log1p(adata)
if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})
self._pca(adata)
temp_logger.finish_progress(progress_name="preprocess by seurat recipe")

Expand All @@ -595,6 +602,9 @@ def config_sctransform_recipe(self, adata: AnnData) -> None:
}
self.select_genes_kwargs = {"inplace": True}
self.sctransform_kwargs = {"layers": raw_layers, "n_top_genes": 2000}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)
self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50}

def preprocess_adata_sctransform(
Expand Down Expand Up @@ -624,6 +634,9 @@ def preprocess_adata_sctransform(
# TODO: if inplace in select_genes is True, the following subset is unnecessary.
adata._inplace_subset_var(adata.var["use_for_pca"])
self.sctransform(adata, **self.sctransform_kwargs)
if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})
self._pca(adata)

temp_logger.finish_progress(progress_name="preprocess by sctransform recipe")
Expand All @@ -644,6 +657,9 @@ def config_pearson_residuals_recipe(self, adata: AnnData) -> None:
# select layers in adata to be normalized
normalize_layers = DKM.get_raw_data_layers(adata)
self.normalize_selected_genes_kwargs = {"layers": normalize_layers, "copy": False}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)
self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50}
self.use_log1p = False

Expand All @@ -669,6 +685,10 @@ def preprocess_adata_pearson_residuals(

self._select_genes(adata)
self._normalize_selected_genes(adata)
if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})

self._pca(adata)

temp_logger.finish_progress(progress_name="preprocess by pearson residual recipe")
Expand All @@ -692,6 +712,9 @@ def config_monocle_pearson_residuals_recipe(self, adata: AnnData) -> None:
self.normalize_selected_genes = normalize_layers_pearson_residuals

self.normalize_selected_genes_kwargs = {"layers": ["X"], "copy": False}
self.regress_out_kwargs = update_dict(
{"obs_key": [], "gene_selection_key": "use_for_pca"}, self.regress_out_kwargs
)

self.pca_kwargs = {"pca_key": "X_pca", "n_pca_components": 50}
self.use_log1p = False
Expand Down Expand Up @@ -720,13 +743,9 @@ def preprocess_adata_monocle_pearson_residuals(
self._normalize_by_cells(adata)
adata.X = X_copy
self._normalize_selected_genes(adata)
# use monocle to preprocess adata
# self.config_monocle_recipe(adata_copy)
# self.pca = None # do not do pca in this monocle
# self.preprocess_adata_monocle(adata_copy)
# for layer in adata_copy.layers:
# if DKM.is_layer_X_key(layer):
# adata.layers[layer] = adata.
if len(self.regress_out_kwargs["obs_key"]) > 0:
self._regress_out(adata)
self.pca_kwargs.update({"layer": "regress_out"})

self.pca(adata, **self.pca_kwargs)
temp_logger.finish_progress(progress_name="preprocess by monocle pearson residual recipe")
Expand Down
29 changes: 14 additions & 15 deletions dynamo/preprocessing/preprocessor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1540,18 +1540,19 @@ def regress_out_parallel(
layer: str = DKM.X_LAYER,
obs_key: Optional[List[str]] = None,
gene_selection_key: Optional[str] = None,
n_cores: Optional[int] = None,
obsm_store_key: str = "X_regress_out",
):
"""Perform linear regression to remove the effects of given variables from a target variable.

Args:
adata: an AnnData object. Feature matrix of shape (n_samples, n_features)
adata: an AnnData object. Feature matrix of shape (n_samples, n_features).
layer: the layer to regress out. Defaults to "X".
obs_key: List of observation keys to be removed
obs_key: List of observation keys to be removed.
gene_selection_key: the key in adata.var that contains boolean values indicating the genes' filtering results.
For example, if "use_for_pca" is selected, regress out is applied only to the genes that are True
for "use_for_pca". Providing this input decreases the processing time of regressing out data.
n_cores: Change this to the number of cores on your system for parallel computing.
n_cores: the number of cores to use for parallel computing. Defaults to None.
obsm_store_key: the key to store the regress out result. Defaults to "X_regress_out".
"""

Expand All @@ -1571,13 +1572,9 @@ def regress_out_parallel(
subset_adata = adata[:, adata.var.loc[:, gene_selection_key]]
x_prev = DKM.select_layer_data(subset_adata, layer)

x_prev = x_prev.toarray().copy()
import timeit

stime = timeit.default_timer()

x_prev = x_prev.toarray()
if x_prev.shape[1] < 10000:
y_new = x_prev # for the cases when variables are more than 1.
y_new = x_prev.copy() # for the cases when variables are more than 1.

for key in obs_key:
if key not in adata.obs.keys():
Expand All @@ -1596,11 +1593,14 @@ def regress_out_parallel(
# Predict the residuals after removing the effects of the variables and save to "X_regress_out"
res = regress_out_chunk(x_prev, y_new)
else:
if n_cores is None:
import os

n_cores = os.cpu_count() / 2 # Use half of available cores as the default.

import multiprocessing
import os

ctx = multiprocessing.get_context("fork")
n_cores = os.cpu_count()

regress_val = np.zeros((x_prev.shape[0], x_prev.shape[1]))
# Split the input data into chunks for parallel processing
Expand Down Expand Up @@ -1635,7 +1635,6 @@ def regress_out_parallel(

res = np.hstack(results)

main_info("regress out: keys(%s), time(%f)" % (obs_key, timeit.default_timer() - stime))
adata.obsm[obsm_store_key] = res


Expand All @@ -1645,11 +1644,11 @@ def regress_out_chunk(
"""Perform a linear regression to remove the effects of cell features (percentage of mitochondria, etc.)

Args:
obs_feature: the training dataset to be regressed out from the target.
gene_expr : List of observation keys used to regress out their effect to gene expression
obs_feature: list of observation keys used to regress out their effect to gene expression.
gene_expr : the current gene expression values of the target variables.

Returns:
numpy array: Predict the effects of the variables to remove
numpy array: the residuals that are predicted the effects of the variables.
"""
from sklearn.linear_model import LinearRegression

Expand Down
23 changes: 10 additions & 13 deletions dynamo/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
import scipy.sparse
import statsmodels.api as sm
from anndata import AnnData
from scipy.sparse.linalg import LinearOperator, svds
from scipy.sparse import csc_matrix, csr_matrix, issparse
from scipy.sparse.linalg import LinearOperator, svds
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.utils import check_random_state
from sklearn.utils.extmath import svd_flip
Expand Down Expand Up @@ -778,6 +778,7 @@ def decode(adata: anndata.AnnData) -> None:
# ---------------------------------------------------------------------------------------------------
# pca


def _truncatedSVD_with_center(
X: Union[csc_matrix, csr_matrix],
n_components: int = 30,
Expand Down Expand Up @@ -844,7 +845,7 @@ def rmatmat(x):
)

# Solve SVD without calculating individuals entries in LinearOperator.
U, Sigma, VT = svds(X_centered, solver='arpack', k=n_components, v0=v0)
U, Sigma, VT = svds(X_centered, solver="arpack", k=n_components, v0=v0)
Sigma = Sigma[::-1]
U, VT = svd_flip(U[:, ::-1], VT[::-1])
X_transformed = U * Sigma
Expand All @@ -865,11 +866,11 @@ def rmatmat(x):
)
X_pca = result_dict["X_pca"]
fit.components_ = result_dict["components_"]
fit.explained_variance_ratio_ = result_dict[
"explained_variance_ratio_"]
fit.explained_variance_ratio_ = result_dict["explained_variance_ratio_"]

return fit, X_pca


def _pca_fit(
X: np.ndarray,
pca_func: Callable,
Expand All @@ -893,7 +894,7 @@ class from sklearn.decomposition.
A tuple containing two elements:
- The fitted PCA object, which has a 'fit' and 'transform' method.
- The transformed array X_pca of shape (n_samples, n_components).
"""
"""
fit = pca_func(
n_components=min(n_components, X.shape[1] - 1),
**kwargs,
Expand Down Expand Up @@ -1007,6 +1008,7 @@ def pca(

if use_incremental_PCA:
from sklearn.decomposition import IncrementalPCA

fit, X_pca = _pca_fit(
X_data,
pca_func=IncrementalPCA,
Expand Down Expand Up @@ -1034,24 +1036,19 @@ def pca(
# data. It only performs SVD decomposition, which is the second part
# in our _truncatedSVD_with_center function.
fit, X_pca = _pca_fit(
X_data,
pca_func=TruncatedSVD,
n_components=n_pca_components + 1,
random_state=random_state
X_data, pca_func=TruncatedSVD, n_components=n_pca_components + 1, random_state=random_state
)
# the first column is related to the total UMI (or library size)
X_pca = X_pca[:, 1:]

adata.obsm[pca_key] = X_pca
if use_incremental_PCA or adata.n_obs < use_truncated_SVD_threshold:
adata.uns[pcs_key] = fit.components_.T
adata.uns[
"explained_variance_ratio_"] = fit.explained_variance_ratio_
adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_
else:
# the first column is related to the total UMI (or library size)
adata.uns[pcs_key] = fit.components_.T[:, 1:]
adata.uns[
"explained_variance_ratio_"] = fit.explained_variance_ratio_[1:]
adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_[1:]
adata.uns["pca_mean"] = fit.mean_ if hasattr(fit, "mean_") else None

if return_all:
Expand Down
18 changes: 11 additions & 7 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,14 @@ def test_pca():
preprocessor = Preprocessor()
preprocessor.preprocess_adata_seurat_wo_pca(adata)
adata = dyn.pp.pca(adata, n_pca_components=30)
assert adata.obsm["X"].shape[1] == 30
assert adata.obsm["X_pca"].shape[1] == 30
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you will need to update the pca_key in the pca function so that the default key is X_pca https://github.com/aristoteleo/dynamo-release/blob/master/dynamo/preprocessing/utils.py

def pca

assert adata.uns["PCs"].shape[1] == 30
assert adata.uns["explained_variance_ratio_"].shape[0] == 30

X_filterd = adata.X[:, adata.var.use_for_pca.values].copy()
pca = PCA(n_components=30, random_state=0)
X_pca_sklearn = pca.fit_transform(X_filterd.toarray())
assert np.linalg.norm(X_pca_sklearn[:, :10] - adata.obsm["X"][:, :10]) < 1e-1
assert np.linalg.norm(X_pca_sklearn[:, :10] - adata.obsm["X_pca"][:, :10]) < 1e-1
assert np.linalg.norm(pca.components_.T[:, :10] - adata.uns["PCs"][:, :10]) < 1e-1
assert np.linalg.norm(pca.explained_variance_ratio_[:10] - adata.uns["explained_variance_ratio_"][:10]) < 1e-1

Expand Down Expand Up @@ -229,15 +229,19 @@ def test_is_nonnegative():


def test_regress_out():
adata = dyn.sample_data.hematopoiesis_raw()
starttime = timeit.default_timer()
celltype_key = "Cell_type"
figsize = (10, 10)
adata = dyn.read("./data/zebrafish.h5ad") # dyn.sample_data.hematopoiesis_raw()
dyn.pl.basic_stats(adata)
dyn.pl.highest_frac_genes(adata)

preprocessor = Preprocessor()
preprocessor = Preprocessor(regress_out_kwargs={"obs_key": ["nCounts", "pMito"]})

starttime = timeit.default_timer()
# preprocessor.preprocess_adata(adata, recipe="monocle", regress_out=["nCounts", "Dummy", "Test", "pMito"])
preprocessor.preprocess_adata(adata, recipe="seurat", regress_out=["nCounts", "pMito"])
preprocessor.preprocess_adata(adata, recipe="monocle")
dyn.tl.reduceDimension(adata, basis="pca")

dyn.pl.umap(adata, color=celltype_key, figsize=figsize)
print("The preprocess_adata() time difference is :", timeit.default_timer() - starttime)


Expand Down