Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize pca #469

Merged
merged 18 commits into from
Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions dynamo/preprocessing/Preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,31 @@ def _pca(self, adata: AnnData) -> None:
main_info("reducing dimension by PCA...")
self.pca(adata, **self.pca_kwargs)

def preprocess_adata_seurat_wo_pca(
    self,
    adata: AnnData,
    tkey: Optional[str] = None,
    experiment_type: Optional[str] = None,
) -> None:
    """Preprocess the anndata object according to the standard Seurat recipe
    without the final PCA step.

    Running every Seurat stage except dimension reduction makes it possible
    to test different dimension reduction methods on identically prepared
    data.

    Args:
        adata: the AnnData object to preprocess in place.
        tkey: the key of the time information in `adata.obs`, forwarded to
            `standardize_adata`. Defaults to None.
        experiment_type: the experiment type of the data, forwarded to
            `standardize_adata`. If None, it will be inferred. Defaults to
            None.

    Returns:
        None. `adata` is updated in place.
    """
    main_info("Running preprocessing pipeline...")
    temp_logger = LoggerManager.gen_logger("preprocessor-seurat_wo_pca")
    temp_logger.log_time()

    self.standardize_adata(adata, tkey, experiment_type)

    self.filter_cells_by_outliers(adata)
    self._filter_genes_by_outliers(adata)
    self._normalize_by_cells(adata)
    self._select_genes(adata)
    self._log1p(adata)

    temp_logger.finish_progress(
        progress_name="preprocess by seurat wo pca recipe"
    )

def config_monocle_recipe(
self, adata: AnnData, n_top_genes: int = 2000, gene_selection_method: str = "SVR"
) -> None:
Expand Down
132 changes: 92 additions & 40 deletions dynamo/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -807,24 +807,35 @@ def _truncatedSVD_with_center(
random_state = check_random_state(random_state)
np.random.set_state(random_state.get_state())
v0 = random_state.uniform(-1, 1, np.min(X.shape))
n_components = min(n_components, X.shape[1] - 1)

mean = X.mean(0)
X_H = X.T.conj()
mean_H = mean.T.conj()
ones = np.ones(X.shape[0])[None, :].dot

# Following callables implements different type of matrix calculation.
def matvec(x):
"""Matrix-vector multiplication. Performs the operation X_centered*x
where x is a column vector or an 1-D array."""
return X.dot(x) - mean.dot(x)

def matmat(x):
"""Matrix-matrix multiplication. Performs the operation X_centered*x
where x is a matrix or ndarray."""
return X.dot(x) - mean.dot(x)

def rmatvec(x):
"""Adjoint matrix-vector multiplication. Performs the operation
X_centered^H * x where x is a column vector or an 1-d array."""
return X_H.dot(x) - mean_H.dot(ones(x))

def rmatmat(x):
"""Adjoint matrix-matrix multiplication. Performs the operation
X_centered^H * x where x is a matrix or ndarray."""
return X_H.dot(x) - mean_H.dot(ones(x))

# Construct the LinearOperator with callables above.
X_centered = LinearOperator(
shape=X.shape,
matvec=matvec,
Expand All @@ -834,6 +845,7 @@ def rmatmat(x):
dtype=X.dtype,
)

# Solve SVD without calculating individuals entries in LinearOperator.
U, Sigma, VT = svds(X_centered , k=n_components, v0=v0)
Sigma = Sigma[::-1]
U, VT = svd_flip(U[:, ::-1], VT[::-1])
Expand All @@ -849,7 +861,48 @@ def rmatmat(x):
"explained_variance_ratio_": exp_var / full_var,
}

return result_dict
fit = PCA(
n_components=n_components,
random_state=random_state,
)
X_pca = result_dict["X_pca"]
fit.components_ = result_dict["components_"]
fit.explained_variance_ratio_ = result_dict[
"explained_variance_ratio_"]

return fit, X_pca
Sichao25 marked this conversation as resolved.
Show resolved Hide resolved

def _pca_fit(
X: np.ndarray,
pca_func: Callable,
n_components: int = 30,
**kwargs,
) -> Tuple:
"""Apply PCA to the input data array X using the specified PCA function.

Args:
X: the input data array of shape (n_samples, n_features).
pca_func: the PCA function to use, which should have a 'fit' and
'transform' method, such as the PCA class or the IncrementalPCA
class from sklearn.decomposition.
n_components: the number of principal components to compute. If
n_components is greater than or equal to the number of features in
X, it will be set to n_features - 1 to avoid overfitting.
**kwargs: any additional keyword arguments that will be passed to the
PCA function.

Returns:
A tuple containing two elements:
- The fitted PCA object, which has a 'fit' and 'transform' method.
- The transformed array X_pca of shape (n_samples, n_components).
"""
fit = pca_func(
n_components=min(n_components, X.shape[1] - 1),
**kwargs,
).fit(X)
X_pca = fit.transform(X)
return fit, X_pca


def pca_monocle(
Sichao25 marked this conversation as resolved.
Show resolved Hide resolved
adata: AnnData,
Expand All @@ -859,10 +912,12 @@ def pca_monocle(
pcs_key: str = "PCs",
genes_to_append: Union[List[str], None] = None,
layer: Union[List[str], str, None] = None,
return_all: bool = False,
svd_solver: Literal["randomized", "arpack"] = "randomized",
random_state: int = 0,
Xiaojieqiu marked this conversation as resolved.
Show resolved Hide resolved
use_IPCA: bool = False,
IPCA_batch_size: Optional[int] = None,
use_truncated_SVD_threshold: int = 500000,
use_incremental_PCA: bool = False,
incremental_batch_size: Optional[int] = None,
return_all: bool = False,
) -> Union[AnnData, Tuple[AnnData, Union[PCA, TruncatedSVD], np.ndarray]]:
"""Perform PCA reduction for monocle recipe.
Sichao25 marked this conversation as resolved.
Show resolved Hide resolved

Expand All @@ -876,15 +931,17 @@ def pca_monocle(
genes_to_append: a list of genes should be inspected. Defaults to None.
layer: the layer(s) to perform dimension reduction on. Would be
overrided by X_data. Defaults to None.
return_all: whether to return the PCA fit model and the reduced array
together with the updated AnnData object.
Defaults to False.
svd_solver: the svd_solver to solve svd decomposition in PCA.
random_state: the seed used to initialize the random state for PCA.
use_IPCA: whether to use Incremental PCA. Recommend enabling IPCA when
dataset is too large to fit in memory.
IPCA_batch_size: The number of samples to use for each batch when
performing IPCA. If batch_size is None, then batch_size is inferred
from the data and set to 5 * n_features.
use_truncated_SVD_threshold: the threshold of observations to use
truncated SVD instead of standard PCA for efficiency.
use_incremental_PCA: whether to use Incremental PCA. Recommend enabling
incremental PCA when dataset is too large to fit in memory.
incremental_batch_size: The number of samples to use for each batch when
performing incremental PCA. If batch_size is None, then batch_size
is inferred from the data and set to 5 * n_features.
return_all: whether to return the PCA fit model and the reduced array
together with the updated AnnData object. Defaults to False.

Raises:
ValueError: layer provided is not invalid.
Expand Down Expand Up @@ -936,48 +993,43 @@ def pca_monocle(
adata.var.iloc[bad_genes, adata.var.columns.tolist().index("use_for_pca")] = False
X_data = X_data[:, valid_ind]

USE_TRUNCATED_SVD_THRESHOLD = 100000
if use_IPCA:
if use_incremental_PCA:
from sklearn.decomposition import IncrementalPCA
fit = IncrementalPCA(
n_components=min(n_pca_components, X_data.shape[1] - 1),
batch_size=IPCA_batch_size,
).fit(X_data)
X_pca = fit.transform(X_data)
fit, X_pca = _pca_fit(
X_data,
pca_func=IncrementalPCA,
n_components=n_pca_components,
batch_size=incremental_batch_size,
)
else:
if adata.n_obs < USE_TRUNCATED_SVD_THRESHOLD:
if adata.n_obs < use_truncated_SVD_threshold:
if not issparse(X_data):
fit = PCA(
n_components=min(n_pca_components, X_data.shape[1] - 1),
svd_solver="randomized",
random_state=random_state,
).fit(X_data)
X_pca = fit.transform(X_data)
else:
result_dict = _truncatedSVD_with_center(
fit, X_pca = _pca_fit(
X_data,
n_components=min(n_pca_components, X_data.shape[1] - 1),
pca_func=PCA,
n_components=n_pca_components,
svd_solver=svd_solver,
random_state=random_state,
)
fit = PCA(
n_components=min(n_pca_components, X_data.shape[1] - 1),
else:
fit, X_pca = _truncatedSVD_with_center(
X_data,
n_components=n_pca_components,
random_state=random_state,
)
X_pca = result_dict["X_pca"]
fit.components_ = result_dict["components_"]
fit.explained_variance_ratio_ = result_dict[
"explained_variance_ratio_"]
else:
# unscaled PCA
Sichao25 marked this conversation as resolved.
Show resolved Hide resolved
fit = TruncatedSVD(
n_components=min(n_pca_components + 1, X_data.shape[1] - 1),
random_state=random_state,
fit, X_pca = _pca_fit(
X_data,
pca_func=TruncatedSVD,
n_components=n_pca_components + 1,
random_state=random_state
)
# first columns is related to the total UMI (or library size)
X_pca = fit.fit_transform(X_data)[:, 1:]
X_pca = X_pca[:, 1:]

adata.obsm[pca_key] = X_pca
if use_IPCA or adata.n_obs < USE_TRUNCATED_SVD_THRESHOLD:
if use_incremental_PCA or adata.n_obs < use_truncated_SVD_threshold:
Sichao25 marked this conversation as resolved.
Show resolved Hide resolved
adata.uns[pcs_key] = fit.components_.T
adata.uns[
"explained_variance_ratio_"] = fit.explained_variance_ratio_
Expand Down
2 changes: 2 additions & 0 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ def test_compute_gene_exp_fraction():

def test_pca():
adata = dyn.sample_data.zebrafish()
preprocessor = Preprocessor()
preprocessor.preprocess_adata_seurat_wo_pca(adata)
adata = dyn.pp.pca_monocle(adata, n_pca_components=30)
Xiaojieqiu marked this conversation as resolved.
Show resolved Hide resolved
assert adata.obsm["X"].shape[1] == 30
assert adata.uns['PCs'].shape[1] == 30
Expand Down