merge master to head

aristoteleo · May 4, 2023 · a1effbc · a1effbc
2 parents 15d896d + 9a079f5
commit a1effbc
Show file tree

Hide file tree

Showing 61 changed files with 8,699 additions and 6,643 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@
 
 Inclusive model of expression dynamics with metabolic labeling based scRNA-seq / multiomics, vector field reconstruction, potential landscape mapping, differential geometry analyses, and most probably paths / *in silico* perturbation predictions.
 
-[Installation](https://dynamo-release.readthedocs.io/en/latest/ten_minutes_to_dynamo.html#how-to-install) - [Ten minutes to dynamo](https://dynamo-release.readthedocs.io/en/latest/ten_minutes_to_dynamo.html) - [Tutorials](https://dynamo-release.readthedocs.io/en/latest/notebooks/Differential_geometry.html) - [API](https://dynamo-release.readthedocs.io/en/latest/API.html) - [Citation](https://www.biorxiv.org/content/10.1101/696724v2) - [Theory](https://dynamo-release.readthedocs.io/en/latest/notebooks/Primer.html)
+[Installation](https://dynamo-release.readthedocs.io/en/latest/ten_minutes_to_dynamo.html#how-to-install) - [Ten minutes to dynamo](https://dynamo-release.readthedocs.io/en/latest/ten_minutes_to_dynamo.html) - [Tutorials](https://dynamo-release.readthedocs.io/en/latest/notebooks/Differential_geometry.html) - [API](https://dynamo-release.readthedocs.io/en/latest/API.html) - [Citation](https://www.sciencedirect.com/science/article/pii/S0092867421015774?via%3Dihub) - [Theory](https://dynamo-release.readthedocs.io/en/latest/notebooks/Primer.html)
 
 ![Dynamo](https://user-images.githubusercontent.com/7456281/152110270-7ee1b0ed-1205-495d-9d65-59c7984d2fa2.png)
 

diff --git a/build.sh b/build.sh
@@ -0,0 +1 @@
+$PYTHON setup.py install     # Python command to install the script.
diff --git a/conda_recipe/bld.bat b/conda_recipe/bld.bat
@@ -0,0 +1,2 @@
+"%PYTHON%" setup.py install
+if errorlevel 1 exit 1
diff --git a/conda_recipe/build.sh b/conda_recipe/build.sh
@@ -0,0 +1 @@
+$PYTHON setup.py install     # Python command to install the script.
diff --git a/conda_recipe/meta.yaml b/conda_recipe/meta.yaml
@@ -0,0 +1,57 @@
+package:
+  name: "dynamo-release"
+  version: "1.2.0"
+
+source:
+  git_rev: v1.2.0
+  git_url: https://github.com/aristoteleo/dynamo-release.git
+
+build:
+  preserve_egg_dir: True
+
+requirements:
+  host:
+    - python
+    - typing-extensions
+    - openpyxl
+    - get_version
+    - numpy 1.23.*
+    - markdown 3.3.*
+  build:
+    - python
+  runs:
+    - python
+    - setuptools
+    - numpy 1.23.*
+    - pandas>=1.3.5
+    - scipy>=1.4.1
+    - scikit-learn>=0.19.1
+    - anndata>=0.8.0
+    - matplotlib>=3.5.3
+    - nxviz==0.7.3
+    - get_version>=3.5.4
+    - PATSY
+    - statsmodels
+    - numba>=0.54.0
+    - seaborn>=0.9.0
+    - colorcet>=2.0.1
+    - tqdm
+    - python-igraph>=0.7.1
+    - cdlib>=0.2.6
+    - louvain==0.8.0
+    - pynndescent>=0.5.2
+    - pre-commit
+    - networkx
+    - typing-extensions
+    - openpyxl
+
+about:
+  home: https://github.com/aristoteleo/dynamo-release
+  license: MIT
+  license_familY: MIT
+  license_file: LICENSE
+  summary: "Inclusive model of expression dynamics with metabolic labeling based scRNA-seq / multiomics, vector field reconstruction, potential landscape mapping, differential geometry analyses, and most probably paths / in silico perturbation predictions."
+
+extra:
+  recipe-maintainers:
+    - Xiaojieqiu
diff --git a/dynamo/configuration.py b/dynamo/configuration.py
@@ -1,4 +1,5 @@
 import warnings
+from typing import List, Optional, Tuple, Union
 
 import colorcet
 import matplotlib
@@ -9,7 +10,7 @@
 from cycler import cycler
 from matplotlib import cm, colors, rcParams
 
-from .dynamo_logger import main_info, main_warning
+from .dynamo_logger import main_debug, main_info
 
 
 class DynamoAdataKeyManager:
@@ -62,16 +63,19 @@ def gen_layer_pearson_residual_key(layer: str):
         )
 
     def select_layer_data(adata: AnnData, layer: str, copy=False) -> pd.DataFrame:
-        """select layer data based on layer key. The default layer is X layer in adata.
-        For layer-like data such as X stored in adata.X (but not in adata.layers) and protein data specified by dynamo convention,
-        this utility provides an unified interface for selecting layer data with shape n_obs x n_var."""
+        """This utility provides a unified interface for selecting layer data.
+
+        The default layer is X layer in adata with shape n_obs x n_var. For protein data it selects adata.obsm["protein"]
+        as specified by dynamo convention (the number of proteins are generally less than detected genes `n_var`).
+        For other layer data, select data based on layer key with shape n_obs x n_var.
+        """
         if layer is None:
             layer = DynamoAdataKeyManager.X_LAYER
         res_data = None
         if layer == DynamoAdataKeyManager.X_LAYER:
             res_data = adata.X
         elif layer == DynamoAdataKeyManager.PROTEIN_LAYER:
-            res_data = adata.obsm["protein"]
+            res_data = adata.obsm["protein"] if "protein" in adata.obsm_keys() else None
         else:
             res_data = adata.layers[layer]
         if copy:
@@ -102,6 +106,8 @@ def check_if_layer_exist(adata: AnnData, layer: str) -> bool:
     def get_available_layer_keys(adata, layers="all", remove_pp_layers=True, include_protein=True):
         """Get the list of available layers' keys. If `layers` is set to all, return a list of all available layers; if `layers` is set to a list, then the intersetion of available layers and `layers` will be returned."""
         layer_keys = list(adata.layers.keys())
+        if layers is None: # layers=adata.uns["pp"]["experiment_layers"], in calc_sz_factor
+            layers = "X"
         if remove_pp_layers:
             layer_keys = [i for i in layer_keys if not i.startswith("X_")]
 
@@ -137,6 +143,61 @@ def allowed_X_layer_names():
     def init_uns_pp_namespace(adata: AnnData):
         adata.uns[DynamoAdataKeyManager.UNS_PP_KEY] = {}
 
+    def get_excluded_layers(
+        X_total_layers: bool = False,
+        splicing_total_layers: bool = False
+    ) -> List:
+        """Get a list of excluded layers based on the provided arguments.
+
+        When splicing_total_layers is False, the function normalize spliced and unspliced RNA separately using each
+        layer's size factors. When X_total_layers is False, the function normalize X (normally it corresponds to the
+        spliced RNA or total RNA for a conventional scRNA-seq or labeling scRNA-seq) using its own size factor.
+
+        Args:
+            X_total_layers: whether to also normalize adata.X by size factor from total RNA.
+            splicing_total_layers: whether to also normalize spliced / unspliced layers by size factor from total RNA.
+
+        Returns:
+            The list of layers to be excluded.
+        """
+        excluded_layers = []
+        if not X_total_layers:
+            excluded_layers.extend(["X"])
+        if not splicing_total_layers:
+            excluded_layers.extend(["spliced", "unspliced"])
+        return excluded_layers
+
+    def aggregate_layers_into_total(
+        _adata: AnnData,
+        layers: Union[str, List[str]] = "all",
+        total_layers: Optional[List[str]] = None,
+        extend_layers: bool = True,
+    ) -> Tuple[Optional[List[str]], Union[str, List[str]]]:
+        """Create a total layer in adata by aggregating multiple layers.
+
+        The size factor normalization function is able to calculate size factors from customized layers. Given list
+        of total_layers, this helper function will calculate a temporary `_total_` layer.
+
+        Args:
+            _adata: the Anndata object.
+            layers: the layer(s) to be normailized in the normailzation function.
+            total_layers: the layer(s) to sum up to get the total mRNA. For example, ["spliced", "unspliced"],
+                ["uu", "ul", "su", "sl"] or ["new", "old"], etc.
+            extend_layers: whether to extend the `_total_` layer to the list of layers.
+
+        Returns:
+            The tuple contains total layers and layers. Anndata object will be updated with `_total_` layer.
+        """
+        if not isinstance(total_layers, list):
+            total_layers = [total_layers]
+        if len(set(total_layers).difference(_adata.layers.keys())) == 0:
+            total = None
+            for t_key in total_layers:
+                total = _adata.layers[t_key] if total is None else total + _adata.layers[t_key]
+            _adata.layers["_total_"] = total
+            if extend_layers:
+                layers.extend(["_total_"])
+        return total_layers, layers
 
 # TODO discuss alias naming convention
 DKM = DynamoAdataKeyManager
@@ -786,5 +847,5 @@ def set_pub_style_mpltex():
 
 # initialize DynamoSaveConfig and DynamoVisConfig mode defaults
 DynamoAdataConfig.update_data_store_mode("full")
-main_info("setting visualization default mode in dynamo. Your customized matplotlib settings might be overritten.")
+main_debug("setting visualization default mode in dynamo. Your customized matplotlib settings might be overwritten.")
 DynamoVisConfig.set_default_mode()
diff --git a/dynamo/dynamo_logger.py b/dynamo/dynamo_logger.py
@@ -154,8 +154,8 @@ def error(self, message, indent_level=1, *args, **kwargs):
 
     def info_insert_adata(self, key, adata_attr="obsm", indent_level=1, *args, **kwargs):
         message = "<insert> %s to %s in AnnData Object." % (key, adata_attr)
-        message = format_logging_message(message, logging.INFO, indent_level=indent_level)
-        return self.logger.error(message, *args, **kwargs)
+        message = format_logging_message(message, logging.DEBUG, indent_level=indent_level)
+        return self.logger.debug(message, *args, **kwargs)
 
     def info_insert_adata_var(self, key, indent_level=1, *args, **kwargs):
         return self.info_insert_adata(self, key, adata_attr="var", indent_level=1, *args, **kwargs)
@@ -189,18 +189,17 @@ def report_progress(self, percent=None, count=None, total=None, progress_name=""
 
     def finish_progress(self, progress_name="", time_unit="s", indent_level=1):
         self.log_time()
-        self.report_progress(percent=100, progress_name=progress_name)
+        # self.report_progress(percent=100, progress_name=progress_name)
 
         saved_terminator = self.logger_stream_handler.terminator
         self.logger_stream_handler.terminator = ""
-        self.logger.info("\n")
         self.logger_stream_handler.flush()
         self.logger_stream_handler.terminator = saved_terminator
 
         if time_unit == "s":
-            self.info("[%s] finished [%.4fs]" % (progress_name, self.time_passed), indent_level=indent_level)
+            self.info("[%s] completed [%.4fs]" % (progress_name, self.time_passed), indent_level=indent_level)
         elif time_unit == "ms":
-            self.info("[%s] finished [%.4fms]" % (progress_name, self.time_passed * 1e3), indent_level=indent_level)
+            self.info("[%s] completed [%.4fms]" % (progress_name, self.time_passed * 1e3), indent_level=indent_level)
         else:
             raise NotImplementedError
         # self.logger.info("|")

diff --git a/dynamo/plot/preprocess.py b/dynamo/plot/preprocess.py
@@ -9,7 +9,7 @@
 from ..configuration import DynamoAdataKeyManager
 from ..dynamo_logger import main_warning
 from ..preprocessing import preprocess as pp
-from ..preprocessing.preprocess_monocle_utils import top_table
+from ..preprocessing.gene_selection import get_prediction_by_svr
 from ..preprocessing.utils import detect_experiment_datatype
 from ..tools.utils import get_mapper, update_dict
 from .utils import save_fig
@@ -649,47 +649,36 @@ def feature_genes(
     save_show_or_return: str = "show",
     save_kwargs: dict = {},
 ):
-    """Plot selected feature genes on top of the mean vs. dispersion scatterplot.
+    """Plot selected feature genes on top of the mean vs. dispersion scatter plot.
 
-    Parameters
-    ----------
+    Args:
         adata: :class:`~anndata.AnnData`
             AnnData object
         layer: `str` (default: `X`)
             The data from a particular layer (include X) used for making the feature gene plot.
         mode: None or `str` (default: `None`)
-            The method to select the feature genes (can be either `dispersion`, `gini` or `SVR`).
+            The method to select the feature genes (can be either `cv_dispersion`, `fano_dispersion` or `gini`).
         figsize: `string` (default: (4, 3))
             Figure size of each facet.
         save_show_or_return: {'show', 'save', 'return'} (default: `show`)
             Whether to save, show or return the figure.
         save_kwargs: `dict` (default: `{}`)
-            A dictionary that will passed to the save_fig function. By default it is an empty dictionary and the
+            A dictionary that will be passed to the save_fig function. By default, it is an empty dictionary and the
             save_fig function will use the {"path": None, "prefix": 'feature_genes', "dpi": None, "ext": 'pdf',
-            "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise you can provide a
+            "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a
             dictionary that properly modify those keys according to your needs.
 
-    Returns
-    -------
+    Returns:
         Nothing but plots the selected feature genes via the mean, CV plot.
     """
 
     import matplotlib.pyplot as plt
 
     mode = adata.uns["feature_selection"] if mode is None else mode
-
     layer = DynamoAdataKeyManager.get_available_layer_keys(adata, layer, include_protein=False)[0]
-
     uns_store_key = None
-    if mode == "dispersion":
-        uns_store_key = "dispFitInfo" if layer in ["raw", "X"] else layer + "_dispFitInfo"
 
-        table = top_table(adata, layer)
-        x_min, x_max = (
-            np.nanmin(table["mean_expression"]),
-            np.nanmax(table["mean_expression"]),
-        )
-    elif mode == "SVR":
+    if "_dispersion" in mode:  # "cv_dispersion", "fano_dispersion"
         prefix = "" if layer == "X" else layer + "_"
         uns_store_key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR"
 
@@ -709,11 +698,12 @@ def feature_genes(
     ordering_genes = adata.var["use_for_pca"] if "use_for_pca" in adata.var.columns else None
 
     mu_linspace = np.linspace(x_min, x_max, num=1000)
-    fit = (
-        adata.uns[uns_store_key]["disp_func"](mu_linspace)
-        if mode == "dispersion"
-        else adata.uns[uns_store_key]["SVR"](mu_linspace.reshape(-1, 1))
-    )
+    if "_dispersion" in mode:
+        mean = adata.uns[uns_store_key]["mean"]
+        cv = adata.uns[uns_store_key]["cv"]
+        svr_gamma = adata.uns[uns_store_key]["svr_gamma"]
+        fit, _ = get_prediction_by_svr(mean, cv, svr_gamma)
+        fit = fit(mu_linspace.reshape(-1, 1))
 
     plt.figure(figsize=figsize)
     plt.plot(mu_linspace, fit, alpha=0.4, color="r")
@@ -724,15 +714,7 @@ def feature_genes(
     )
 
     valid_disp_table = table.iloc[valid_ind, :]
-    if mode == "dispersion":
-        ax = plt.scatter(
-            valid_disp_table["mean_expression"],
-            valid_disp_table["dispersion_empirical"],
-            s=3,
-            alpha=1,
-            color="xkcd:red",
-        )
-    elif mode == "SVR":
+    if "_dispersion" in mode:
         ax = plt.scatter(
             valid_disp_table[prefix + "log_m"],
             valid_disp_table[prefix + "log_cv"],
@@ -743,15 +725,7 @@ def feature_genes(
 
     neg_disp_table = table.iloc[~valid_ind, :]
 
-    if mode == "dispersion":
-        ax = plt.scatter(
-            neg_disp_table["mean_expression"],
-            neg_disp_table["dispersion_empirical"],
-            s=3,
-            alpha=0.5,
-            color="xkcd:grey",
-        )
-    elif mode == "SVR":
+    if "_dispersion" in mode:
         ax = plt.scatter(
             neg_disp_table[prefix + "log_m"],
             neg_disp_table[prefix + "log_cv"],
@@ -760,9 +734,6 @@ def feature_genes(
             color="xkcd:grey",
         )
 
-    # plt.xlim((0, 100))
-    if mode == "dispersion":
-        plt.xscale("log")
     plt.yscale("log")
     plt.xlabel("Mean (log)")
     plt.ylabel("Dispersion (log)") if mode == "dispersion" else plt.ylabel("CV (log)")
@@ -1080,7 +1051,7 @@ def highest_frac_genes(
 
         else:
             main_warning(
-                "%s not in adata.var, ignoring the gene annotation key when plotting",
+                "%s not in adata.var, ignoring the gene annotation key when plotting" % gene_annotation_key,
                 indent_level=2,
             )
 

diff --git a/dynamo/prediction/perturbation.py b/dynamo/prediction/perturbation.py
@@ -11,12 +11,10 @@
 from ..vectorfield.scVectorField import KOVectorField, vector_field_function_knockout
 from ..vectorfield.vector_calculus import (
     jacobian,
-    rank_cell_groups,
-    rank_cells,
-    rank_genes,
     vecfld_from_adata,
     vector_transformation,
 )
+from ..vectorfield.rank_vf import rank_cell_groups, rank_cells, rank_genes
 from .utils import expr_to_pca, pca_to_expr, z_score, z_score_inv
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		$PYTHON setup.py install # Python command to install the script.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		"%PYTHON%" setup.py install
		if errorlevel 1 exit 1