Merge pull request #664 from Sichao25/pp

Debug the preprocessing of integer matrix input
aristoteleo · Mar 11, 2024 · 98f1eb5 · 98f1eb5
2 parents 6f52dc6 + 53c9f0e
commit 98f1eb5
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 8 deletions.
diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py
@@ -535,6 +535,7 @@ def select_genes_by_seurat_recipe(
     gene_names: Union[List[str], None] = None,
     var_filter_key: str = "pass_basic_filter",
     inplace: bool = False,
+    initial_dtype: Optional[type] = None,
 ) -> None:
     """A general function for feature genes selection.
 
@@ -556,11 +557,15 @@ def select_genes_by_seurat_recipe(
         var_filter_key: filter gene names based on the key defined in adata.var before gene selection. Defaults to
             "pass_basic_filter".
         inplace: when inplace is True, subset adata according to selected genes. Defaults to False.
+        initial_dtype: dtype used to initialize new arrays; should be a float type. If None, inferred from adata.X.
 
     Raises:
         NotImplementedError: the recipe is invalid/unsupported.
     """
 
+    if initial_dtype is None:
+        initial_dtype = adata.X.dtype if adata.X.dtype == np.float32 or adata.X.dtype == np.float64 else np.float32
+
     pass_filter_genes = adata.var_names
     if gene_names:
         main_info("select genes on gene names from arguments <gene_names>")
@@ -584,8 +589,8 @@ def select_genes_by_seurat_recipe(
             chunk_size=chunk_size,
             chunk_mode="gene",
         )
-        mean = np.zeros(len(pass_filter_genes), dtype=adata.X.dtype)
-        variance = np.zeros(len(pass_filter_genes), dtype=adata.X.dtype)
+        mean = np.zeros(len(pass_filter_genes), dtype=initial_dtype)
+        variance = np.zeros(len(pass_filter_genes), dtype=initial_dtype)
 
         for mat_data in chunked_layer_mats:
             layer_mat = mat_data[0]

diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py
@@ -38,6 +38,7 @@ def calc_sz_factor(
     scale_to: Union[float, None] = None,
     use_all_genes_cells: bool = True,
     genes_use_for_norm: Union[List[str], None] = None,
+    initial_dtype: Optional[type] = None,
 ) -> anndata.AnnData:
     """Calculate the size factor of each cell using geometric mean or median of total UMI across cells for a AnnData
     object.
@@ -67,12 +68,16 @@ def calc_sz_factor(
         genes_use_for_norm: A list of gene names that will be used to calculate total RNA for each cell and then the
             size factor for normalization. This is often very useful when you want to use only the host genes to
             normalize the dataset in a virus infection experiment (i.e. CMV or SARS-CoV-2 infection). Defaults to None.
+        initial_dtype: dtype used to initialize new arrays; should be a float type. If None, inferred from adata_ori.X.
 
     Returns:
         An updated anndata object that are updated with the `Size_Factor` (`layer_` + `Size_Factor`) column(s) in the
         obs attribute.
     """
 
+    if initial_dtype is None:
+        initial_dtype = adata_ori.X.dtype if adata_ori.X.dtype == np.float32 or adata_ori.X.dtype == np.float64 else np.float32
+
     if use_all_genes_cells:
         # let us ignore the `inplace` parameter in pandas.Categorical.remove_unused_categories  warning.
         with warnings.catch_warnings():
@@ -122,6 +127,7 @@ def calc_sz_factor(
                 chunk_size=chunk_size,
                 total_layers=None,
                 scale_to=scale_to,
+                initial_dtype=initial_dtype,
             )
         else:
             sfs, cell_total = sz_util(
@@ -133,6 +139,7 @@ def calc_sz_factor(
                 chunk_size=chunk_size,
                 total_layers=total_layers,
                 scale_to=scale_to,
+                initial_dtype=initial_dtype,
             )
 
         sfs[~np.isfinite(sfs)] = 1
@@ -280,6 +287,11 @@ def normalize(
 
     layers = DKM.get_available_layer_keys(adata, layers)
 
+    if "X" in layers and transform_int_to_float and adata.X.dtype == "int":
+        main_warning("Transforming adata.X from int to float32 for normalization. If you want to disable this, set "
+                     "`transform_int_to_float` to False.")
+        adata.X = adata.X.astype("float32")
+
     if recalc_sz:
         if "use_for_pca" in adata.var.columns and keep_filtered is False:
             adata = adata[:, adata.var.loc[:, "use_for_pca"]]
@@ -304,11 +316,6 @@ def normalize(
         splicing_total_layers=splicing_total_layers,
     )
 
-    if "X" in layers and transform_int_to_float and adata.X.dtype == "int":
-        main_warning("Transforming adata.X from int to float32 for normalization. If you want to disable this, set "
-                     "`transform_int_to_float` to False.")
-        adata.X = adata.X.astype("float32")
-
     main_debug("size factor normalize following layers: " + str(layers))
     for layer in layers:
         if layer in excluded_layers:
@@ -432,6 +439,7 @@ def sz_util(
     total_layers: List[str] = None,
     CM: pd.DataFrame = None,
     scale_to: Union[float, None] = None,
+    initial_dtype: type=np.float32,
 ) -> Tuple[pd.Series, pd.Series]:
     """Calculate the size factor for a given layer.
 
@@ -450,6 +458,7 @@ def sz_util(
             ["uu", "ul", "su", "sl"] or ["new", "old"], etc. Defaults to None.
         CM: the data to operate on, overriding the layer. Defaults to None.
         scale_to: the final total expression for each cell that will be scaled to. Defaults to None.
+        initial_dtype: the data type used to initialize the `cell_total` array.
 
     Raises:
         NotImplementedError: method is invalid.
@@ -469,7 +478,7 @@ def sz_util(
     chunk_size = chunk_size if chunk_size is not None else adata.n_obs
     chunked_CMs = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) if CM is None else CM
 
-    cell_total = np.zeros(adata.n_obs, dtype=adata.X.dtype)
+    cell_total = np.zeros(adata.n_obs, dtype=initial_dtype)
 
     for CM_data in chunked_CMs:
         CM = CM_data[0]