Skip to content

Commit

Permalink
Merge pull request #665 from Sichao25/shiny
Browse files Browse the repository at this point in the history
Extra filter after pearson residuals normalization
  • Loading branch information
Xiaojieqiu authored Feb 26, 2024
2 parents 08d8c97 + 3c51be5 commit 8c09bc8
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 1 deletion.
26 changes: 25 additions & 1 deletion dynamo/preprocessing/Preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from .QC import basic_stats
from .QC import filter_cells_by_outliers as monocle_filter_cells_by_outliers
from .QC import filter_genes_by_outliers as monocle_filter_genes_by_outliers
from .QC import regress_out_parallel
from .QC import regress_out_parallel, filter_cells_by_highly_variable_genes
from .transform import Freeman_Tukey, log, log1p, log2
from .utils import (
_infer_labeling_experiment_type,
Expand All @@ -52,6 +52,8 @@ def __init__(
filter_cells_by_outliers_kwargs: Dict[str, Any] = {},
filter_genes_by_outliers_function: Callable = monocle_filter_genes_by_outliers,
filter_genes_by_outliers_kwargs: Dict[str, Any] = {},
filter_cells_by_highly_variable_genes_function: Callable = filter_cells_by_highly_variable_genes,
filter_cells_by_highly_variable_genes_kwargs: Dict[str, Any] = {},
normalize_by_cells_function: Callable = normalize,
normalize_by_cells_function_kwargs: Dict[str, Any] = {},
size_factor_function: Callable = calc_sz_factor,
Expand Down Expand Up @@ -88,6 +90,10 @@ def __init__(
filter_cells_by_outliers_kwargs: arguments that will be passed to filter_cells_by_outliers. Defaults to {}.
filter_genes_by_outliers_function: filter genes by thresholds. Defaults to monocle_filter_genes_by_outliers.
filter_genes_by_outliers_kwargs: arguments that will be passed to filter_genes_by_outliers. Defaults to {}.
filter_cells_by_highly_variable_genes_function: filter cells by highly variable genes. Defaults to
filter_cells_by_highly_variable_genes.
filter_cells_by_highly_variable_genes_kwargs: arguments that will be passed to
filter_cells_by_highly_variable_genes. Defaults to {}.
normalize_by_cells_function: function for performing cell-wise normalization. Defaults to
normalize_cell_expr_by_size_factors.
normalize_by_cells_function_kwargs: arguments that will be passed to normalize_by_cells_function. Defaults
Expand Down Expand Up @@ -117,6 +123,7 @@ def __init__(

self.filter_cells_by_outliers = filter_cells_by_outliers_function
self.filter_genes_by_outliers = filter_genes_by_outliers_function
self.filter_cells_by_highly_variable_genes = filter_cells_by_highly_variable_genes_function
self.normalize_by_cells = normalize_by_cells_function
self.calc_size_factor = size_factor_function
self.calc_new_to_total_ratio = calc_new_to_total_ratio
Expand All @@ -135,6 +142,7 @@ def __init__(

# kwargs pass to the functions above
self.filter_genes_by_outliers_kwargs = filter_genes_by_outliers_kwargs
self.filter_cells_by_highly_variable_genes_kwargs = filter_cells_by_highly_variable_genes_kwargs
self.normalize_by_cells_function_kwargs = normalize_by_cells_function_kwargs
self.filter_cells_by_outliers_kwargs = filter_cells_by_outliers_kwargs
self.size_factor_kwargs = size_factor_kwargs
Expand Down Expand Up @@ -283,6 +291,18 @@ def _filter_genes_by_outliers(self, adata: AnnData) -> None:
main_debug("gene filter kwargs:" + str(self.filter_genes_by_outliers_kwargs))
self.filter_genes_by_outliers(adata, **self.filter_genes_by_outliers_kwargs)

def _filter_cells_by_highly_variable_genes(self, adata: AnnData) -> None:
"""Select valid cells based on the method specified as the preprocessor's `filter_cells_by_highly_variable_genes`.
Args:
adata: an AnnData object.
"""

if self.filter_cells_by_highly_variable_genes:
main_debug("filtering cells by highly variable genes...")
main_debug("filter_cells_by_highly_variable_genes kwargs:" + str(self.filter_cells_by_highly_variable_genes_kwargs))
self.filter_cells_by_highly_variable_genes(adata, **self.filter_cells_by_highly_variable_genes_kwargs)

def _calc_size_factor(self, adata: AnnData) -> None:
"""Calculate the size factor of each cell based on method specified as the preprocessor's `calc_size_factor`.
Expand Down Expand Up @@ -692,6 +712,7 @@ def config_pearson_residuals_recipe(self, adata: AnnData) -> None:

self.filter_cells_by_outliers = None
self.filter_genes_by_outliers = None
self.filter_cells_by_highly_variable_genes_kwargs = {"keep_filtered": False}
self.normalize_by_cells = None
self.select_genes = select_genes_by_pearson_residuals
self.select_genes_kwargs = {"n_top_genes": 2000}
Expand Down Expand Up @@ -730,6 +751,7 @@ def preprocess_adata_pearson_residuals(
self._exclude_gene_list(adata)
self._force_gene_list(adata)

self._filter_cells_by_highly_variable_genes(adata)
self._normalize_selected_genes(adata)
if len(self.regress_out_kwargs["obs_keys"]) > 0:
self._regress_out(adata)
Expand All @@ -751,6 +773,7 @@ def config_monocle_pearson_residuals_recipe(self, adata: AnnData) -> None:
self.config_monocle_recipe(adata)
# self.filter_cells_by_outliers = None
# self.filter_genes_by_outliers = None
self.filter_cells_by_highly_variable_genes_kwargs = {"keep_filtered": False}
self.normalize_by_cells = normalize
self.select_genes = select_genes_by_pearson_residuals
self.select_genes_kwargs = {"n_top_genes": 2000}
Expand Down Expand Up @@ -791,6 +814,7 @@ def preprocess_adata_monocle_pearson_residuals(
self._calc_size_factor(adata)
self._normalize_by_cells(adata)
adata.X = X_copy
self._filter_cells_by_highly_variable_genes(adata)
self._normalize_selected_genes(adata)
if len(self.regress_out_kwargs["obs_keys"]) > 0:
self._regress_out(adata)
Expand Down
42 changes: 42 additions & 0 deletions dynamo/preprocessing/QC.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,48 @@ def filter_cells_by_outliers(
return adata


def filter_cells_by_highly_variable_genes(
adata: anndata.AnnData,
keep_filtered: bool = False,
select_genes_layer: str = DKM.X_LAYER,
high_var_genes_key: str = DKM.VAR_USE_FOR_PCA,
obs_store_key: str = "pass_basic_filter",
) -> anndata.AnnData:
"""Filter cells based on the expression of highly variable genes.
Args:
adata: an AnnData object.
keep_filtered: whether to keep cells that don't pass the filtering in the adata object.
select_genes_layer: the layer used for genes selection.
high_var_genes_key: the key in adata.var to store the highly variable genes.
obs_store_key: the key in adata.obs to store the filtered data.
Returns:
An updated AnnData object indicating the selection of cells for downstream analysis.
"""
if high_var_genes_key not in adata.var.keys():
raise ValueError(
"The key %s is not found in adata.var. Please run genes selection methods first."
% high_var_genes_key
)

if obs_store_key not in adata.obs.keys():
adata.obs[obs_store_key] = True

filter_bool = adata.var[high_var_genes_key].values
X = DKM.select_layer_data(adata, layer=select_genes_layer)
X = X[:, filter_bool].A if issparse(X) else X[:, filter_bool]
nan_columns_index = np.where(np.sum(X, axis=1) == 0)[0]

adata.obs[obs_store_key].iloc[nan_columns_index] = False

if not keep_filtered:
main_debug("inplace subsetting adata by filtered cells", indent_level=2)
adata._inplace_subset_obs(adata.obs[obs_store_key])

return adata


def filter_genes_by_outliers(
adata: anndata.AnnData,
filter_bool: Union[np.ndarray, None] = None,
Expand Down
2 changes: 2 additions & 0 deletions dynamo/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
basic_stats,
filter_genes_by_clusters,
filter_cells_by_outliers,
filter_cells_by_highly_variable_genes,
filter_genes_by_outliers,
filter_genes_by_pattern,
)
Expand Down Expand Up @@ -59,6 +60,7 @@
"recipe_velocyto",
"calc_Gini",
"filter_cells_by_outliers",
"filter_cells_by_highly_variable_genes",
"select_genes_monocle",
"select_genes_by_pearson_residuals",
"filter_genes",
Expand Down

0 comments on commit 8c09bc8

Please sign in to comment.