Skip to content

Commit

Permalink
Merge pull request #664 from Sichao25/pp
Browse files Browse the repository at this point in the history
Debug the preprocessing of integer matrix input
  • Loading branch information
Xiaojieqiu authored Mar 11, 2024
2 parents 6f52dc6 + 53c9f0e commit 98f1eb5
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 8 deletions.
9 changes: 7 additions & 2 deletions dynamo/preprocessing/gene_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,7 @@ def select_genes_by_seurat_recipe(
gene_names: Union[List[str], None] = None,
var_filter_key: str = "pass_basic_filter",
inplace: bool = False,
initial_dtype: Optional[type] = None,
) -> None:
"""A general function for feature genes selection.
Expand All @@ -556,11 +557,15 @@ def select_genes_by_seurat_recipe(
var_filter_key: filter gene names based on the key defined in adata.var before gene selection. Defaults to
"pass_basic_filter".
inplace: when inplace is True, subset adata according to selected genes. Defaults to False.
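initial_dtype: the data type used when initializing new arrays. Should be a floating-point type. If None, the
dtype of adata.X is used when it is float32 or float64; otherwise np.float32 is used.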
Raises:
NotImplementedError: the recipe is invalid/unsupported.
"""

if initial_dtype is None:
initial_dtype = adata.X.dtype if adata.X.dtype == np.float32 or adata.X.dtype == np.float64 else np.float32

pass_filter_genes = adata.var_names
if gene_names:
main_info("select genes on gene names from arguments <gene_names>")
Expand All @@ -584,8 +589,8 @@ def select_genes_by_seurat_recipe(
chunk_size=chunk_size,
chunk_mode="gene",
)
mean = np.zeros(len(pass_filter_genes), dtype=adata.X.dtype)
variance = np.zeros(len(pass_filter_genes), dtype=adata.X.dtype)
mean = np.zeros(len(pass_filter_genes), dtype=initial_dtype)
variance = np.zeros(len(pass_filter_genes), dtype=initial_dtype)

for mat_data in chunked_layer_mats:
layer_mat = mat_data[0]
Expand Down
21 changes: 15 additions & 6 deletions dynamo/preprocessing/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def calc_sz_factor(
scale_to: Union[float, None] = None,
use_all_genes_cells: bool = True,
genes_use_for_norm: Union[List[str], None] = None,
initial_dtype: Optional[type] = None,
) -> anndata.AnnData:
"""Calculate the size factor of each cell using geometric mean or median of total UMI across cells for an AnnData
object.
Expand Down Expand Up @@ -67,12 +68,16 @@ def calc_sz_factor(
genes_use_for_norm: A list of gene names that will be used to calculate total RNA for each cell and then the
size factor for normalization. This is often very useful when you want to use only the host genes to
normalize the dataset in a virus infection experiment (i.e. CMV or SARS-CoV-2 infection). Defaults to None.
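initial_dtype: the data type used when initializing new arrays. Should be a floating-point type. If None, the
dtype of adata_ori.X is used when it is float32 or float64; otherwise np.float32 is used.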
Returns:
An anndata object updated with the `Size_Factor` (`layer_` + `Size_Factor`) column(s) in the
obs attribute.
"""

if initial_dtype is None:
initial_dtype = adata_ori.X.dtype if adata_ori.X.dtype == np.float32 or adata_ori.X.dtype == np.float64 else np.float32

if use_all_genes_cells:
# let us ignore the `inplace` parameter in pandas.Categorical.remove_unused_categories warning.
with warnings.catch_warnings():
Expand Down Expand Up @@ -122,6 +127,7 @@ def calc_sz_factor(
chunk_size=chunk_size,
total_layers=None,
scale_to=scale_to,
initial_dtype=initial_dtype,
)
else:
sfs, cell_total = sz_util(
Expand All @@ -133,6 +139,7 @@ def calc_sz_factor(
chunk_size=chunk_size,
total_layers=total_layers,
scale_to=scale_to,
initial_dtype=initial_dtype,
)

sfs[~np.isfinite(sfs)] = 1
Expand Down Expand Up @@ -280,6 +287,11 @@ def normalize(

layers = DKM.get_available_layer_keys(adata, layers)

if "X" in layers and transform_int_to_float and adata.X.dtype == "int":
main_warning("Transforming adata.X from int to float32 for normalization. If you want to disable this, set "
"`transform_int_to_float` to False.")
adata.X = adata.X.astype("float32")

if recalc_sz:
if "use_for_pca" in adata.var.columns and keep_filtered is False:
adata = adata[:, adata.var.loc[:, "use_for_pca"]]
Expand All @@ -304,11 +316,6 @@ def normalize(
splicing_total_layers=splicing_total_layers,
)

if "X" in layers and transform_int_to_float and adata.X.dtype == "int":
main_warning("Transforming adata.X from int to float32 for normalization. If you want to disable this, set "
"`transform_int_to_float` to False.")
adata.X = adata.X.astype("float32")

main_debug("size factor normalize following layers: " + str(layers))
for layer in layers:
if layer in excluded_layers:
Expand Down Expand Up @@ -432,6 +439,7 @@ def sz_util(
total_layers: List[str] = None,
CM: pd.DataFrame = None,
scale_to: Union[float, None] = None,
initial_dtype: type=np.float32,
) -> Tuple[pd.Series, pd.Series]:
"""Calculate the size factor for a given layer.
Expand All @@ -450,6 +458,7 @@ def sz_util(
["uu", "ul", "su", "sl"] or ["new", "old"], etc. Defaults to None.
CM: the data to operate on, overriding the layer. Defaults to None.
scale_to: the final total expression for each cell that will be scaled to. Defaults to None.
initial_dtype: the data type used when initializing the `cell_total` array. Defaults to np.float32.
Raises:
NotImplementedError: method is invalid.
Expand All @@ -469,7 +478,7 @@ def sz_util(
chunk_size = chunk_size if chunk_size is not None else adata.n_obs
chunked_CMs = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) if CM is None else CM

cell_total = np.zeros(adata.n_obs, dtype=adata.X.dtype)
cell_total = np.zeros(adata.n_obs, dtype=initial_dtype)

for CM_data in chunked_CMs:
CM = CM_data[0]
Expand Down

0 comments on commit 98f1eb5

Please sign in to comment.