Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extra filter after pearson residuals normalization #665

Merged
merged 4 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion dynamo/preprocessing/Preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from .QC import basic_stats
from .QC import filter_cells_by_outliers as monocle_filter_cells_by_outliers
from .QC import filter_genes_by_outliers as monocle_filter_genes_by_outliers
from .QC import regress_out_parallel
from .QC import regress_out_parallel, filter_cells_by_highly_variable_genes
from .transform import Freeman_Tukey, log, log1p, log2
from .utils import (
_infer_labeling_experiment_type,
Expand All @@ -52,6 +52,8 @@ def __init__(
filter_cells_by_outliers_kwargs: Dict[str, Any] = {},
filter_genes_by_outliers_function: Callable = monocle_filter_genes_by_outliers,
filter_genes_by_outliers_kwargs: Dict[str, Any] = {},
filter_cells_by_highly_variable_genes_function: Callable = filter_cells_by_highly_variable_genes,
filter_cells_by_highly_variable_genes_kwargs: Dict[str, Any] = {},
normalize_by_cells_function: Callable = normalize,
normalize_by_cells_function_kwargs: Dict[str, Any] = {},
size_factor_function: Callable = calc_sz_factor,
Expand Down Expand Up @@ -88,6 +90,10 @@ def __init__(
filter_cells_by_outliers_kwargs: arguments that will be passed to filter_cells_by_outliers. Defaults to {}.
filter_genes_by_outliers_function: filter genes by thresholds. Defaults to monocle_filter_genes_by_outliers.
filter_genes_by_outliers_kwargs: arguments that will be passed to filter_genes_by_outliers. Defaults to {}.
filter_cells_by_highly_variable_genes_function: filter cells by highly variable genes. Defaults to
filter_cells_by_highly_variable_genes.
filter_cells_by_highly_variable_genes_kwargs: arguments that will be passed to
filter_cells_by_highly_variable_genes. Defaults to {}.
normalize_by_cells_function: function for performing cell-wise normalization. Defaults to
normalize_cell_expr_by_size_factors.
normalize_by_cells_function_kwargs: arguments that will be passed to normalize_by_cells_function. Defaults
Expand Down Expand Up @@ -117,6 +123,7 @@ def __init__(

self.filter_cells_by_outliers = filter_cells_by_outliers_function
self.filter_genes_by_outliers = filter_genes_by_outliers_function
self.filter_cells_by_highly_variable_genes = filter_cells_by_highly_variable_genes_function
self.normalize_by_cells = normalize_by_cells_function
self.calc_size_factor = size_factor_function
self.calc_new_to_total_ratio = calc_new_to_total_ratio
Expand All @@ -135,6 +142,7 @@ def __init__(

# kwargs pass to the functions above
self.filter_genes_by_outliers_kwargs = filter_genes_by_outliers_kwargs
self.filter_cells_by_highly_variable_genes_kwargs = filter_cells_by_highly_variable_genes_kwargs
self.normalize_by_cells_function_kwargs = normalize_by_cells_function_kwargs
self.filter_cells_by_outliers_kwargs = filter_cells_by_outliers_kwargs
self.size_factor_kwargs = size_factor_kwargs
Expand Down Expand Up @@ -283,6 +291,18 @@ def _filter_genes_by_outliers(self, adata: AnnData) -> None:
main_debug("gene filter kwargs:" + str(self.filter_genes_by_outliers_kwargs))
self.filter_genes_by_outliers(adata, **self.filter_genes_by_outliers_kwargs)

def _filter_cells_by_highly_variable_genes(self, adata: AnnData) -> None:
"""Select valid cells based on the method specified as the preprocessor's `filter_cells_by_highly_variable_genes`.

Args:
adata: an AnnData object.
"""

if self.filter_cells_by_highly_variable_genes:
main_debug("filtering cells by highly variable genes...")
main_debug("filter_cells_by_highly_variable_genes kwargs:" + str(self.filter_cells_by_highly_variable_genes_kwargs))
self.filter_cells_by_highly_variable_genes(adata, **self.filter_cells_by_highly_variable_genes_kwargs)

def _calc_size_factor(self, adata: AnnData) -> None:
"""Calculate the size factor of each cell based on method specified as the preprocessor's `calc_size_factor`.

Expand Down Expand Up @@ -692,6 +712,7 @@ def config_pearson_residuals_recipe(self, adata: AnnData) -> None:

self.filter_cells_by_outliers = None
self.filter_genes_by_outliers = None
self.filter_cells_by_highly_variable_genes_kwargs = {"keep_filtered": False}
self.normalize_by_cells = None
self.select_genes = select_genes_by_pearson_residuals
self.select_genes_kwargs = {"n_top_genes": 2000}
Expand Down Expand Up @@ -730,6 +751,7 @@ def preprocess_adata_pearson_residuals(
self._exclude_gene_list(adata)
self._force_gene_list(adata)

self._filter_cells_by_highly_variable_genes(adata)
self._normalize_selected_genes(adata)
if len(self.regress_out_kwargs["obs_keys"]) > 0:
self._regress_out(adata)
Expand All @@ -751,6 +773,7 @@ def config_monocle_pearson_residuals_recipe(self, adata: AnnData) -> None:
self.config_monocle_recipe(adata)
# self.filter_cells_by_outliers = None
# self.filter_genes_by_outliers = None
self.filter_cells_by_highly_variable_genes_kwargs = {"keep_filtered": False}
self.normalize_by_cells = normalize
self.select_genes = select_genes_by_pearson_residuals
self.select_genes_kwargs = {"n_top_genes": 2000}
Expand Down Expand Up @@ -791,6 +814,7 @@ def preprocess_adata_monocle_pearson_residuals(
self._calc_size_factor(adata)
self._normalize_by_cells(adata)
adata.X = X_copy
self._filter_cells_by_highly_variable_genes(adata)
self._normalize_selected_genes(adata)
if len(self.regress_out_kwargs["obs_keys"]) > 0:
self._regress_out(adata)
Expand Down
42 changes: 42 additions & 0 deletions dynamo/preprocessing/QC.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,48 @@ def filter_cells_by_outliers(
return adata


def filter_cells_by_highly_variable_genes(
adata: anndata.AnnData,
keep_filtered: bool = False,
select_genes_layer: str = DKM.X_LAYER,
high_var_genes_key: str = DKM.VAR_USE_FOR_PCA,
obs_store_key: str = "pass_basic_filter",
) -> anndata.AnnData:
"""Filter cells based on the expression of highly variable genes.

Args:
adata: an AnnData object.
keep_filtered: whether to keep cells that don't pass the filtering in the adata object.
select_genes_layer: the layer used for genes selection.
high_var_genes_key: the key in adata.var to store the highly variable genes.
obs_store_key: the key in adata.obs to store the filtered data.

Returns:
An updated AnnData object indicating the selection of cells for downstream analysis.
"""
if high_var_genes_key not in adata.var.keys():
raise ValueError(
"The key %s is not found in adata.var. Please run genes selection methods first."
% high_var_genes_key
)

if obs_store_key not in adata.obs.keys():
adata.obs[obs_store_key] = True

filter_bool = adata.var[high_var_genes_key].values
X = DKM.select_layer_data(adata, layer=select_genes_layer)
X = X[:, filter_bool].A if issparse(X) else X[:, filter_bool]
nan_columns_index = np.where(np.sum(X, axis=1) == 0)[0]

adata.obs[obs_store_key].iloc[nan_columns_index] = False

if not keep_filtered:
main_debug("inplace subsetting adata by filtered cells", indent_level=2)
adata._inplace_subset_obs(adata.obs[obs_store_key])

return adata


def filter_genes_by_outliers(
adata: anndata.AnnData,
filter_bool: Union[np.ndarray, None] = None,
Expand Down
2 changes: 2 additions & 0 deletions dynamo/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
basic_stats,
filter_genes_by_clusters,
filter_cells_by_outliers,
filter_cells_by_highly_variable_genes,
filter_genes_by_outliers,
filter_genes_by_pattern,
)
Expand Down Expand Up @@ -59,6 +60,7 @@
"recipe_velocyto",
"calc_Gini",
"filter_cells_by_outliers",
"filter_cells_by_highly_variable_genes",
"select_genes_monocle",
"select_genes_by_pearson_residuals",
"filter_genes",
Expand Down
Loading