diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 2829486d4..614d56131 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -30,7 +30,7 @@ from .QC import basic_stats from .QC import filter_cells_by_outliers as monocle_filter_cells_by_outliers from .QC import filter_genes_by_outliers as monocle_filter_genes_by_outliers -from .QC import regress_out_parallel +from .QC import regress_out_parallel, filter_cells_by_highly_variable_genes from .transform import Freeman_Tukey, log, log1p, log2 from .utils import ( _infer_labeling_experiment_type, @@ -52,6 +52,8 @@ def __init__( filter_cells_by_outliers_kwargs: Dict[str, Any] = {}, filter_genes_by_outliers_function: Callable = monocle_filter_genes_by_outliers, filter_genes_by_outliers_kwargs: Dict[str, Any] = {}, + filter_cells_by_highly_variable_genes_function: Callable = filter_cells_by_highly_variable_genes, + filter_cells_by_highly_variable_genes_kwargs: Dict[str, Any] = {}, normalize_by_cells_function: Callable = normalize, normalize_by_cells_function_kwargs: Dict[str, Any] = {}, size_factor_function: Callable = calc_sz_factor, @@ -88,6 +90,10 @@ def __init__( filter_cells_by_outliers_kwargs: arguments that will be passed to filter_cells_by_outliers. Defaults to {}. filter_genes_by_outliers_function: filter genes by thresholds. Defaults to monocle_filter_genes_by_outliers. filter_genes_by_outliers_kwargs: arguments that will be passed to filter_genes_by_outliers. Defaults to {}. + filter_cells_by_highly_variable_genes_function: filter cells by highly variable genes. Defaults to + filter_cells_by_highly_variable_genes. + filter_cells_by_highly_variable_genes_kwargs: arguments that will be passed to + filter_cells_by_highly_variable_genes. Defaults to {}. normalize_by_cells_function: function for performing cell-wise normalization. Defaults to normalize_cell_expr_by_size_factors. normalize_by_cells_function_kwargs: arguments that will be passed to normalize_by_cells_function. Defaults @@ -117,6 +123,7 @@ def __init__( self.filter_cells_by_outliers = filter_cells_by_outliers_function self.filter_genes_by_outliers = filter_genes_by_outliers_function + self.filter_cells_by_highly_variable_genes = filter_cells_by_highly_variable_genes_function self.normalize_by_cells = normalize_by_cells_function self.calc_size_factor = size_factor_function self.calc_new_to_total_ratio = calc_new_to_total_ratio @@ -135,6 +142,7 @@ def __init__( # kwargs pass to the functions above self.filter_genes_by_outliers_kwargs = filter_genes_by_outliers_kwargs + self.filter_cells_by_highly_variable_genes_kwargs = filter_cells_by_highly_variable_genes_kwargs self.normalize_by_cells_function_kwargs = normalize_by_cells_function_kwargs self.filter_cells_by_outliers_kwargs = filter_cells_by_outliers_kwargs self.size_factor_kwargs = size_factor_kwargs @@ -283,6 +291,18 @@ def _filter_genes_by_outliers(self, adata: AnnData) -> None: main_debug("gene filter kwargs:" + str(self.filter_genes_by_outliers_kwargs)) self.filter_genes_by_outliers(adata, **self.filter_genes_by_outliers_kwargs) + def _filter_cells_by_highly_variable_genes(self, adata: AnnData) -> None: + """Select valid cells based on the method specified as the preprocessor's `filter_cells_by_highly_variable_genes`. + + Args: + adata: an AnnData object. + """ + + if self.filter_cells_by_highly_variable_genes: + main_debug("filtering cells by highly variable genes...") + main_debug("filter_cells_by_highly_variable_genes kwargs:" + str(self.filter_cells_by_highly_variable_genes_kwargs)) + self.filter_cells_by_highly_variable_genes(adata, **self.filter_cells_by_highly_variable_genes_kwargs) + def _calc_size_factor(self, adata: AnnData) -> None: """Calculate the size factor of each cell based on method specified as the preprocessor's `calc_size_factor`. @@ -692,6 +712,7 @@ def config_pearson_residuals_recipe(self, adata: AnnData) -> None: self.filter_cells_by_outliers = None self.filter_genes_by_outliers = None + self.filter_cells_by_highly_variable_genes_kwargs = {"keep_filtered": False} self.normalize_by_cells = None self.select_genes = select_genes_by_pearson_residuals self.select_genes_kwargs = {"n_top_genes": 2000} @@ -730,6 +751,7 @@ def preprocess_adata_pearson_residuals( self._exclude_gene_list(adata) self._force_gene_list(adata) + self._filter_cells_by_highly_variable_genes(adata) self._normalize_selected_genes(adata) if len(self.regress_out_kwargs["obs_keys"]) > 0: self._regress_out(adata) @@ -751,6 +773,7 @@ def config_monocle_pearson_residuals_recipe(self, adata: AnnData) -> None: self.config_monocle_recipe(adata) # self.filter_cells_by_outliers = None # self.filter_genes_by_outliers = None + self.filter_cells_by_highly_variable_genes_kwargs = {"keep_filtered": False} self.normalize_by_cells = normalize self.select_genes = select_genes_by_pearson_residuals self.select_genes_kwargs = {"n_top_genes": 2000} @@ -791,6 +814,7 @@ def preprocess_adata_monocle_pearson_residuals( self._calc_size_factor(adata) self._normalize_by_cells(adata) adata.X = X_copy + self._filter_cells_by_highly_variable_genes(adata) self._normalize_selected_genes(adata) if len(self.regress_out_kwargs["obs_keys"]) > 0: self._regress_out(adata) diff --git a/dynamo/preprocessing/QC.py b/dynamo/preprocessing/QC.py index f972de43e..118a7588d 100644 --- a/dynamo/preprocessing/QC.py +++ b/dynamo/preprocessing/QC.py @@ -270,6 +270,48 @@ def filter_cells_by_outliers( return adata +def filter_cells_by_highly_variable_genes( + adata: anndata.AnnData, + keep_filtered: bool = False, + select_genes_layer: str = DKM.X_LAYER, + high_var_genes_key: str = DKM.VAR_USE_FOR_PCA, + obs_store_key: str = "pass_basic_filter", +) -> anndata.AnnData: + """Filter cells based on the expression of highly variable genes. + + Args: + adata: an AnnData object. + keep_filtered: whether to keep cells that don't pass the filtering in the adata object. + select_genes_layer: the layer used for genes selection. + high_var_genes_key: the key in adata.var to store the highly variable genes. + obs_store_key: the key in adata.obs to store the filtered data. + + Returns: + An updated AnnData object indicating the selection of cells for downstream analysis. + """ + if high_var_genes_key not in adata.var.keys(): + raise ValueError( + "The key %s is not found in adata.var. Please run genes selection methods first." + % high_var_genes_key + ) + + if obs_store_key not in adata.obs.keys(): + adata.obs[obs_store_key] = True + + filter_bool = adata.var[high_var_genes_key].values + X = DKM.select_layer_data(adata, layer=select_genes_layer) + X = X[:, filter_bool].A if issparse(X) else X[:, filter_bool] + nan_columns_index = np.where(np.sum(X, axis=1) == 0)[0] + + adata.obs[obs_store_key].iloc[nan_columns_index] = False + + if not keep_filtered: + main_debug("inplace subsetting adata by filtered cells", indent_level=2) + adata._inplace_subset_obs(adata.obs[obs_store_key]) + + return adata + + def filter_genes_by_outliers( adata: anndata.AnnData, filter_bool: Union[np.ndarray, None] = None, diff --git a/dynamo/preprocessing/__init__.py b/dynamo/preprocessing/__init__.py index 22d95a23e..3e3365cc6 100755 --- a/dynamo/preprocessing/__init__.py +++ b/dynamo/preprocessing/__init__.py @@ -15,6 +15,7 @@ basic_stats, filter_genes_by_clusters, filter_cells_by_outliers, + filter_cells_by_highly_variable_genes, filter_genes_by_outliers, filter_genes_by_pattern, ) @@ -59,6 +60,7 @@ "recipe_velocyto", "calc_Gini", "filter_cells_by_outliers", + "filter_cells_by_highly_variable_genes", "select_genes_monocle", "select_genes_by_pearson_residuals", "filter_genes",