diff --git a/dynamo/prediction/perturbation.py b/dynamo/prediction/perturbation.py index 0ae9b5fcc..fed5b2f5c 100644 --- a/dynamo/prediction/perturbation.py +++ b/dynamo/prediction/perturbation.py @@ -11,12 +11,10 @@ from ..vectorfield.scVectorField import KOVectorField, vector_field_function_knockout from ..vectorfield.vector_calculus import ( jacobian, - rank_cell_groups, - rank_cells, - rank_genes, vecfld_from_adata, vector_transformation, ) +from ..vectorfield.rank_vf import rank_cell_groups, rank_cells, rank_genes from .utils import expr_to_pca, pca_to_expr, z_score, z_score_inv diff --git a/dynamo/vectorfield/Ao.py b/dynamo/vectorfield/Ao.py index 28c84b0f7..554c86549 100755 --- a/dynamo/vectorfield/Ao.py +++ b/dynamo/vectorfield/Ao.py @@ -1,20 +1,21 @@ +from typing import Callable, List, Optional, Tuple, Union + import numpy as np -import scipy.sparse as sp from scipy.optimize import least_squares -from tqdm import tqdm +import tqdm from ..tools.utils import condensed_idx_to_squareform_idx, squareform, timeit # from scPotential import show_landscape -def f_left(X, F): +def f_left(X: np.ndarray, F: np.ndarray) -> np.ndarray: """An auxiliary function for fast computation of F.X - (F.X)^T""" R = F.dot(X) return R - R.T -def f_left_jac(q, F): +def f_left_jac(q: np.ndarray, F: np.ndarray) -> np.ndarray: """ Analytical Jacobian of f(Q) = F.Q - (F.Q)^T, where Q is an anti-symmetric matrix s.t. Q^T = -Q. @@ -32,28 +33,27 @@ def f_left_jac(q, F): @timeit -def solveQ(D, F, q0=None, debug=False, precompute_jac=True, **kwargs): +def solveQ( + D: np.ndarray, + F: np.ndarray, + q0: Optional[np.ndarray] = None, + debug: bool = False, + precompute_jac: bool = True, + **kwargs +) -> Union[Tuple[np.ndarray, np.ndarray, np.ndarray, float], Tuple[np.ndarray, np.ndarray]]: """Function to solve for the anti-symmetric Q matrix in the equation: F.Q - (F.Q)^T = F.D - (F.D)^T using least squares. - Parameters - ---------- - D: :class:`~numpy.ndarray` - A symmetric diffusion matrix. - F: :class:`~numpy.ndarray` - Jacobian of the vector field function evaluated at a particular point. - debug: bool - Whether additional info of the solution is returned. - precompute_jac: bool - Whether the analytical Jacobian is precomputed for the optimizer. - - Returns - ------- - Q: :class:`~numpy.ndarray` - The solved anti-symmetric Q matrix. - C: :class:`~numpy.ndarray` - The right-hand side of the equation to be solved. + Args: + D: A symmetric diffusion matrix. + F: Jacobian of the vector field function evaluated at a particular point. + q0: Initial guess of the solution Q + debug: Whether additional info of the solution is returned. + precompute_jac: Whether the analytical Jacobian is precomputed for the optimizer. + + Returns: + The solved anti-symmetric Q matrix and the value of the right-hand side of the equation to be solved, optionally along with the value of the left-hand side of the equation and the cost of the solution if debug is True. """ n = D.shape[0] @@ -75,35 +75,27 @@ def solveQ(D, F, q0=None, debug=False, precompute_jac=True, **kwargs): return Q, C -def Ao_pot_map(vecFunc, X, D=None, **kwargs): +def Ao_pot_map( + vecFunc: Callable, X: np.ndarray, fjac: Optional[Callable] = None, D: Optional[np.ndarray] = None, **kwargs +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, List, List, List]: """Mapping potential landscape with the algorithm developed by Ao method. References: Potential in stochastic differential equations: novel construction. Journal of physics A: mathematical and general, Ao Ping, 2004 - Parameters - ---------- - vecFunc: `function` - The vector field function - X: :class:`~numpy.ndarray` - A n_cell x n_dim matrix of coordinates where the potential function is evaluated. - D: None or :class:`~numpy.ndarray` - Diffusion matrix. It must be a square matrix with size corresponds to the number of columns (features) in the X matrix. - - Returns - ------- - X: :class:`~numpy.ndarray` - A matrix storing the x-coordinates on the two-dimesional grid. - U: :class:`~numpy.ndarray` - A matrix storing the potential value at each position. - P: :class:`~numpy.ndarray` - Steady state distribution or the Boltzmann-Gibbs distribution for the state variable. - vecMat: list - List velocity vector at each position from X. - S: list - List of constant symmetric and semi-positive matrix or friction matrix, corresponding to the divergence part, + Args: + vecFunc: The vector field function. + X: A (n_cell x n_dim) matrix of coordinates where the potential function is evaluated. + fjac: function that returns the Jacobian of the vector field function evaluated at a particular point. + D: Diffusion matrix. It must be a square matrix with size corresponds to the number of columns (features) in the X matrix. + + Returns: + X: A matrix storing the x-coordinates on the two-dimesional grid. + U: A matrix storing the potential value at each position. + P: Steady state distribution or the Boltzmann-Gibbs distribution for the state variable. + vecMat: List velocity vector at each position from X. + S: List of constant symmetric and semi-positive matrix or friction (dissipative) matrix, corresponding to the divergence part, at each position from X. - A: list - List of constant antisymmetric matrix or transverse matrix, corresponding to the curl part, at each position + A: List of constant antisymmetric matrix or transverse (non-dissipative) matrix, corresponding to the curl part, at each position from X. """ @@ -116,7 +108,7 @@ def Ao_pot_map(vecFunc, X, D=None, **kwargs): for i in range(nobs): X_s = X[i, :] - F = nda.Jacobian(vecFunc)(X_s) + F = nda.Jacobian(vecFunc)(X_s) if fjac is None else fjac(X_s) Q, _ = solveQ(D, F, **kwargs) H = np.linalg.inv(D + Q).dot(F) U[i] = -0.5 * X_s.dot(H).dot(X_s) @@ -132,7 +124,6 @@ def Ao_pot_map(vecFunc, X, D=None, **kwargs): return X, U, P, vecMat, S, A - def Ao_pot_map_jac(fjac, X, D=None, **kwargs): nobs, ndim = X.shape if D is None: @@ -150,4 +141,4 @@ def Ao_pot_map_jac(fjac, X, D=None, **kwargs): H = np.linalg.inv(D + Q).dot(F) U[i] = -0.5 * X_s.dot(H).dot(X_s) - return U.flatten() + return U.flatten() \ No newline at end of file diff --git a/dynamo/vectorfield/Bhattacharya.py b/dynamo/vectorfield/Bhattacharya.py index 42b9ee7be..ebb524786 100755 --- a/dynamo/vectorfield/Bhattacharya.py +++ b/dynamo/vectorfield/Bhattacharya.py @@ -1,53 +1,46 @@ +from typing import Callable, Tuple, Union + import numpy as np # from scipy.integrate import odeint from scipy.interpolate import griddata -def path_integral(VecFnc, x_lim, y_lim, xyGridSpacing, dt=1e-2, tol=1e-2, numTimeSteps=1400): +def path_integral( + VecFnc: Callable, + x_lim: np.ndarray, + y_lim: np.ndarray, + xyGridSpacing: Union[int, float], + dt: float = 1e-2, + tol: float = 1e-2, + numTimeSteps: int = 1400, +) -> Tuple[ + int, np.ndarray, np.ndarray, np.ndarray, int, int, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray +]: """A deterministic map of Waddington’s epigenetic landscape for cell fate specification Sudin Bhattacharya, Qiang Zhang and Melvin E. Andersen - Parameters - ---------- - VecFnc - x_lim: `list` - Lower or upper limit of x-axis. - y_lim: `list` - Lower or upper limit of y-axis - xyGridSpacing: `float` - Grid spacing for "starting points" for each "path" on the pot. surface - dt: `float` - Time step for the path integral. - tol: `float` (default: 1.0e-2) - Tolerance to test for convergence. - numTimeSteps: `int` - A high-enough number for convergence with given dt. - - Returns - ------- - numAttractors: `int` - Number of attractors identified by the path integral approach. - attractors_num_X_Y: `numpy.ndarray` - Attractor number and the corresponding x, y coordinates. - sepx_old_new_pathNum: `numpy.ndarray` - The IDs of the two attractors for each separaxis per row. - numPaths_att `numpy.ndarray` - Number of paths per attractor - numPaths: `int` - Total Number of paths for defined grid spacing. - numTimeSteps: `int` - A high-enough number for convergence with given dt. - pot_path: `numpy.ndarray` (dimension: numPaths x numTimeSteps) - Potential along the path. - path_tag: `numpy.ndarray` (dimension: numPaths x 1) - Tag for given path (to denote basin of attraction). - attractors_pot: `numpy.ndarray` - Potential value of each identified attractors by the path integral approach. - x_path: `numpy.ndarray` - x-coord. along path. - y_path: `numpy.ndarray` - y-coord. along path. + Args: + VecFnc: The function of vector field that takes in x, y and returns the velocity at that point. + x_lim: Lower or upper limit of x-axis. + y_lim: Lower or upper limit of y-axis + xyGridSpacing: Grid spacing for "starting points" for each "path" on the potential surface + dt: Time step for the path integral. + tol: Tolerance to test for convergence. + numTimeSteps: A high-enough number for convergence with given dt. + + Returns: + numAttractors: Number of attractors identified by the path integral approach. + attractors_num_X_Y: Attractor number and the corresponding x, y coordinates. + sepx_old_new_pathNum: The IDs of the two attractors for each separaxis per row. + numPaths_att: Number of paths per attractor + numPaths: Total Number of paths for defined grid spacing. + numTimeSteps: A high-enough number for convergence with given dt. + pot_path: Potential along the path. (dimension: numPaths x numTimeSteps) + path_tag: Tag for given path (to denote basin of attraction). (dimension: numPaths x 1) + attractors_pot: Potential value of each identified attractors by the path integral approach. + x_path: x-coord along path. + y_path: y-coord along path. """ # -- First, generate potential surface from deterministic rate equations – @@ -279,7 +272,7 @@ def path_integral(VecFnc, x_lim, y_lim, xyGridSpacing, dt=1e-2, tol=1e-2, numTim numPaths_att[tag - 1] = numPaths_att[tag - 1] + 1 # increment "path counter" - path_counter = path_counter + 1 + path_counter += 1 return ( attractors_num_X_Y, @@ -297,68 +290,54 @@ def path_integral(VecFnc, x_lim, y_lim, xyGridSpacing, dt=1e-2, tol=1e-2, numTim def alignment( - numPaths, - numTimeSteps, - pot_path, - path_tag, - attractors_pot, - x_path, - y_path, - grid=100, - interpolation_method="linear", + numPaths: int, + numTimeSteps: int, + pot_path: np.ndarray, + path_tag: np.ndarray, + attractors_pot: np.ndarray, + x_path: np.ndarray, + y_path: np.ndarray, + grid: int = 100, + interpolation_method: str = "linear", ): """Align potential values so all path-potentials end up at same global min and then generate potential surface with interpolation on a grid. - Parameters - ---------- - numPaths: `int` - Total Number of paths for defined grid spacing. - numTimeSteps: `int` - A high-enough number for convergence with given dt. - pot_path: `numpy.ndarray` (dimension: numPaths x numTimeSteps) - Potential along the path. - path_tag: `numpy.ndarray` (dimension: numPaths x 1) - Tag for given path (to denote basin of attraction). - attractors_pot: `numpy.ndarray` - Potential value of each identified attractors by the path integral approach. - x_path: `numpy.ndarray` - x-coord. along path. - y_path: `numpy.ndarray` - y-coord. along path. - grid: `int` - No. of grid lines in x- and y- directions - interpolation_method: `string` - Method of interpolation in griddata function. One of - - ``nearest`` - return the value at the data point closest to - the point of interpolation. See `NearestNDInterpolator` for - more details. - - ``linear`` - tessellate the input point set to n-dimensional - simplices, and interpolate linearly on each simplex. See - `LinearNDInterpolator` for more details. - - ``cubic`` (1-D) - return the value determined from a cubic - spline. - - ``cubic`` (2-D) - return the value determined from a - piecewise cubic, continuously differentiable (C1), and - approximately curvature-minimizing polynomial surface. See - `CloughTocher2DInterpolator` for more details. - - Returns - ------- - Xgrid: `numpy.ndarray` - x-coordinates of the Grid produced from the meshgrid function. - Ygrid: `numpy.ndarray` - y-coordinates of the Grid produced from the meshgrid function. - Zgrid: `numpy.ndarray` - z-coordinates or potential at each of the x/y coordinate. + Args: + numPaths: Total Number of paths for defined grid spacing. + numTimeSteps: A high-enough number for convergence with given dt. + pot_path: Potential along the path. (dimension: numPaths x numTimeSteps) + path_tag: Tag for given path (to denote basin of attraction). (dimension: numPaths x 1) + attractors_pot: Potential value of each identified attractors by the path integral approach. + x_path: x-coord. along path. + y_path: y-coord. along path. + grid: No. of grid lines in x- and y- directions + interpolation_method: Method of interpolation in griddata function. One of + + ``nearest`` + return the value at the data point closest to + the point of interpolation. See `NearestNDInterpolator` for + more details. + + ``linear`` + tessellate the input point set to n-dimensional + simplices, and interpolate linearly on each simplex. See + `LinearNDInterpolator` for more details. + + ``cubic`` (1-D) + return the value determined from a cubic + spline. + + ``cubic`` (2-D) + return the value determined from a + piecewise cubic, continuously differentiable (C1), and + approximately curvature-minimizing polynomial surface. See + `CloughTocher2DInterpolator` for more details. + + Returns: + Xgrid: x-coordinates of the Grid produced from the meshgrid function. + Ygrid: y-coordinates of the Grid produced from the meshgrid function. + Zgrid: z-coordinates or potential at each of the x/y coordinate. """ # -- need 1-D "lists" (vectors) to plot all x,y, Pot values along paths -- @@ -410,3 +389,6 @@ def alignment( # print('Ran surface grid-interpolation okay!\n') return Xgrid, Ygrid, Zgrid + + +# %% diff --git a/dynamo/vectorfield/FixedPoints.py b/dynamo/vectorfield/FixedPoints.py index ae395e3d1..885b38e6d 100644 --- a/dynamo/vectorfield/FixedPoints.py +++ b/dynamo/vectorfield/FixedPoints.py @@ -1,20 +1,38 @@ +from typing import Optional, Tuple + import numpy as np from scipy.linalg import eig class FixedPoints: - def __init__(self, X=None, J=None): + """The FixedPoints class stores a list of fixed points and their corresponding Jacobian matrices, + and provides methods for computing the eigenvalues of the Jacobian matrices, determining the + stability of the fixed points, and identifying saddle/stable fixed points.""" + + def __init__(self, X: Optional[np.ndarray] = None, J: Optional[np.ndarray] = None): + """This class represents a set of fixed points and their corresponding Jacobian matrices. + The fixed points and Jacobian matrices can be provided as arguments, or they can be added + to the object later using the add_fixed_points method. The eigvals attribute stores the + eigenvalues of the Jacobian matrices, which can be computed using the compute_eigvals method. + The is_stable and is_saddle methods can be used to determine the stability and saddle-point + status of the fixed points, respectively, and the get_fixed_point_types method returns a list of + integers indicating the stability of each fixed point (-1 for stable, 0 for saddle, and 1 for unstable). + + Args: + X: array of fixed points. Defaults to None. + J: array of associated jacobians. Defaults to None. + """ self.X = X if X is not None else [] self.J = J if J is not None else [] self.eigvals = [] - def get_X(self): + def get_X(self) -> np.ndarray: return np.array(self.X) - def get_J(self): + def get_J(self) -> np.ndarray: return np.array(self.J) - def add_fixed_points(self, X, J, tol_redundant=1e-4): + def add_fixed_points(self, X: np.ndarray, J: np.ndarray, tol_redundant: float = 1e-4) -> None: for i, x in enumerate(X): redundant = False if tol_redundant is not None and len(self.X) > 0: @@ -25,7 +43,7 @@ def add_fixed_points(self, X, J, tol_redundant=1e-4): self.X.append(x) self.J.append(J[i]) - def compute_eigvals(self): + def compute_eigvals(self) -> None: self.eigvals = [] for i in range(len(self.J)): if self.J[i] is None or np.isnan(self.J[i]).any(): @@ -34,7 +52,7 @@ def compute_eigvals(self): w, _ = eig(self.J[i]) self.eigvals.append(w) - def is_stable(self): + def is_stable(self) -> np.ndarray: if len(self.eigvals) != len(self.X): self.compute_eigvals() @@ -47,7 +65,7 @@ def is_stable(self): stable[i] = False return stable - def is_saddle(self): + def is_saddle(self) -> Tuple[np.ndarray, np.ndarray]: is_stable = self.is_stable() saddle = np.zeros(len(self.eigvals), dtype=bool) for i, w in enumerate(self.eigvals): @@ -58,7 +76,7 @@ def is_saddle(self): saddle[i] = True return saddle, is_stable - def get_fixed_point_types(self): + def get_fixed_point_types(self) -> np.ndarray: is_saddle, is_stable = self.is_saddle() # -1 -- stable, 0 -- saddle, 1 -- unstable ftype = np.ones(len(self.X)) diff --git a/dynamo/vectorfield/Wang.py b/dynamo/vectorfield/Wang.py index 421bfbc01..92e2682f2 100755 --- a/dynamo/vectorfield/Wang.py +++ b/dynamo/vectorfield/Wang.py @@ -1,29 +1,24 @@ +from typing import Callable, Tuple + import numpy as np from scipy import optimize +from scipy.optimize import OptimizeResult -def Wang_action(X_input, F, D, dim, N, lamada_=1): +def Wang_action(X_input: np.ndarray, F: Callable, D: float, dim: int, N: int, lamada_: float = 1) -> float: """Calculate action by path integral by Wang's method. Quantifying the Waddington landscape and biological paths for development and differentiation. Jin Wang, Kun Zhang, Li Xu, and Erkang Wang, PNAS, 2011 - Parameters - ---------- - X_input: `numpy.ndarray` - The initial guess of the least action path. Default is a straight line connecting the starting and end path. - F: `Function` - The reconstructed vector field function. This is assumed to be time-independent. - D: `float` - The diffusion constant. Note that this can be a space-dependent matrix. - dim: `int` - The feature numbers of the input data. - N: `int` - Number of waypoints along the least action path. - lamada_: `float` - Regularization parameter - - Returns - ------- + Args: + X_input: The initial guess of the least action path. Default is a straight line connecting the starting and end path. + F: The reconstructed vector field function. This is assumed to be time-independent. + D: The diffusion constant. Note that this can be a space-dependent matrix. + dim: The feature numbers of the input data. + N: Number of waypoints along the least action path. + lamada_: Regularization parameter + + Returns: The action function calculated by the Hamilton-Jacobian method. """ @@ -56,11 +51,10 @@ def V_jacobina(F, X): return V_jacobina(X) -def V(F, D, X): +def V(F: Callable, D: float, X: np.ndarray) -> np.ndarray: """Calculate V - Parameters - ---------- + Args: F: `Function` The reconstructed vector field function D: `float` @@ -68,8 +62,7 @@ def V(F, D, X): X: `nummpy.ndarray` The input coordinates corresponding to the cell states. - Returns - ------- + Returns: Returns V """ @@ -78,17 +71,8 @@ def V(F, D, X): return V -def delta_delta_l(X_input): - """Calculate delta_L - - Parameters - ---------- - X_input: `numpy.ndarray` - - Returns - ------- - Return delta_L - """ +def delta_delta_l(X_input) -> Tuple[np.ndarray, float]: + """Calculate delta_L""" delta = np.diff(X_input, 1, 1) delta_l = np.sqrt(np.sum(delta**2, 0)) @@ -96,26 +80,20 @@ def delta_delta_l(X_input): return delta, delta_l -def Wang_LAP(F, n_points, point_start, point_end, D=0.1, lambda_=1): +def Wang_LAP( + F: Callable, n_points: int, point_start: np.ndarray, point_end: np.ndarray, D: float = 0.1, lambda_: float = 1 +) -> OptimizeResult: """Calculating least action path based methods from Jin Wang and colleagues (http://www.pnas.org/cgi/doi/10.1073/pnas.1017017108) - Parameters - ---------- - F: `Function` - The reconstructed vector field function - n_points: 'int' - The number of points along the least action path. - point_start: 'np.ndarray' - The matrix for storing the coordinates (gene expression configuration) of the start point (initial cell state). - point_end: 'np.ndarray' - The matrix for storing the coordinates (gene expression configuration) of the end point (terminal cell state). - D: `float` - The diffusion constant. Note that this can be a space-dependent matrix. - lamada_: `float` - Regularization parameter + Args: + F: The reconstructed vector field function + n_points: The number of points along the least action path. + point_start: The matrix for storing the coordinates (gene expression configuration) of the start point (initial cell state). + point_end: The matrix for storing the coordinates (gene expression configuration) of the end point (terminal cell state). + D: The diffusion constant. Note that this can be a space-dependent matrix. + lamada_: Regularization parameter - Returns - ------- + Returns: The least action path and the action way of the inferred path. """ initpath = point_start.dot(np.ones((1, n_points + 1))) + (point_end - point_start).dot( @@ -133,7 +111,7 @@ def Wang_LAP(F, n_points, point_start, point_end, D=0.1, lambda_=1): return res -def transition_rate(X_input, F, D=0.1, lambda_=1): +def transition_rate(X_input: np.ndarray, F: Callable, D: float = 0.1, lambda_: float = 1) -> np.ndarray: """Calculate the rate to convert from one cell state to another cell state by taking the optimal path. In the small noise limit (D -> 0) the Wentzell-Freidlin theory states that the transition rate from one basin to @@ -145,19 +123,13 @@ def transition_rate(X_input, F, D=0.1, lambda_=1): reference [15]), which is expected to be on the order of 1 [12]. (Reference: Epigenetic state network approach for describing cell phenotypic transitions. Ping Wang, Chaoming Song, Hang Zhang, Zhanghan Wu, Xiao-Jun Tian and Jianhua Xing) - Parameters - ---------- - X_input: `numpy.ndarray` - The initial guess of the least action path. Default is a straight line connecting the starting and end path. - F: `Function` - The reconstructed vector field function - D: `float` - The diffusion constant. Note that this can be a space-dependent matrix. - lamada_: `float` - Regularization parameter + Args: + X_input: The initial guess of the least action path. Default is a straight line connecting the starting and end path. + F: The reconstructed vector field function + D: The diffusion constant. Note that this can be a space-dependent matrix. + lamada_: Regularization parameter - Returns - ------- + Returns: The transition to convert from one cell state to another. """ @@ -167,7 +139,7 @@ def transition_rate(X_input, F, D=0.1, lambda_=1): return r -def MFPT(X_input, F, D=0.1, lambda_=1): +def MFPT(X_input: np.ndarray, F: Callable, D: float = 0.1, lambda_: float = 1) -> float: """Calculate the MFPT (mean first passage time) to convert from one cell state to another cell state by taking the optimal path. The mean first-passage time (MFPT) defines an average timescale for a stochastic event to first occur. The MFPT maps @@ -175,19 +147,13 @@ def MFPT(X_input, F, D=0.1, lambda_=1): state. The inverse of the MFPT is an effective rate of the overall reaction. (reference: Mean First-Passage Times in Biology Nicholas F. Polizzi,a Michael J. Therien,b and David N. Beratan) - Parameters - ---------- - X_input: `numpy.ndarray` - The initial guess of the least action path. Default is a straight line connecting the starting and end path. - F: `Function` - The reconstructed vector field function - D: `float` - The diffusion constant. Note that this can be a space-dependent matrix. - lamada_: `float` - Regularization parameter + Args: + X_input: The initial guess of the least action path. Default is a straight line connecting the starting and end path. + F: The reconstructed vector field function + D: The diffusion constant. Note that this can be a space-dependent matrix. + lamada_: Regularization parameter - Returns - ------- + Returns: The transition to convert from one cell state to another. """ diff --git a/dynamo/vectorfield/__init__.py b/dynamo/vectorfield/__init__.py index 58b950f00..db1226e4e 100644 --- a/dynamo/vectorfield/__init__.py +++ b/dynamo/vectorfield/__init__.py @@ -8,6 +8,19 @@ # vector field clustering related: from .clustering import cluster_field, streamline_clusters from .networks import adj_list_to_matrix, build_network_per_cluster +from .rank_vf import ( + aggregateRegEffs, + rank_acceleration_genes, + rank_cells, + rank_curvature_genes, + rank_divergence_genes, + rank_expression_genes, + rank_genes, + rank_jacobian_genes, + rank_s_divergence_genes, + rank_sensitivity_genes, + rank_velocity_genes, +) # potential related from .scPotential import ( # , vector_field_function @@ -40,23 +53,12 @@ from .utils import get_jacobian, parse_int_df, vector_field_function from .vector_calculus import ( acceleration, - aggregateRegEffs, curl, curvature, divergence, hessian, jacobian, laplacian, - rank_acceleration_genes, - rank_cells, - rank_curvature_genes, - rank_divergence_genes, - rank_expression_genes, - rank_genes, - rank_jacobian_genes, - rank_s_divergence_genes, - rank_sensitivity_genes, - rank_velocity_genes, sensitivity, speed, torsion, diff --git a/dynamo/vectorfield/cell_vectors.py b/dynamo/vectorfield/cell_vectors.py index 6b68ddc93..4937f8996 100644 --- a/dynamo/vectorfield/cell_vectors.py +++ b/dynamo/vectorfield/cell_vectors.py @@ -1,11 +1,21 @@ +from typing import Dict + +from anndata import AnnData + from ..tools.cell_velocities import cell_velocities from .topography import VectorField from .vector_calculus import acceleration, curvature def cell_accelerations( - adata, vf_basis="pca", basis="umap", enforce=True, preserve_len=True, other_kernels_dict={}, **kwargs -): + adata: AnnData, + vf_basis: str = "pca", + basis: str = "umap", + enforce: bool = True, + preserve_len: bool = True, + other_kernels_dict: Dict = {}, + **kwargs +) -> None: """Compute RNA acceleration field via reconstructed vector field and project it to low dimensional embeddings. In classical physics, including fluidics and aerodynamics, velocity and acceleration vector fields are used as @@ -32,36 +42,27 @@ def cell_accelerations( provides such functionalities in dynamo, with vector field that changes over time, similar methods, for example, streakline, pathline, timeline, etc. can be used to visualize the evolution of single cell or cell populations. - Arguments - --------- - adata: :class:`~anndata.AnnData` - an Annodata object. - vf_basis: 'int' (optional, default `pca`) - The dictionary key that corresponds to the low dimensional embedding where the vector field function + Args: + adata: an Anndata object. + vf_basis: The dictionary key that corresponds to the low dimensional embedding where the vector field function reconstructed. - basis: 'int' (optional, default `umap`) - The dictionary key that corresponds to the reduced dimension in `.obsm` attribute. - enforce: `bool` (default: `False`) - Whether to enforce 1) redefining use_for_transition column in obs attribute; + basis: The dictionary key that corresponds to the reduced dimension in `.obsm` attribute. + enforce: Whether to enforce 1) redefining use_for_transition column in obs attribute; 2) recalculation of transition matrix. - preserve_len: `bool` (default: `True`) - Whether to preserve the length of high dimension vector length. When set to be True, the length of low + preserve_len: Whether to preserve the length of high dimension vector length. When set to be True, the length of low dimension projected vector will be proportionally scaled to that of the high dimensional vector. Note that - when preserve_len is set to be True, the acceleration field may seem to be messy (although the magnitude will - be reflected) while the trend of acceleration when `preserve_len` is `True` is more clearer but will lose + when `preserve_len` is set to be `True`, the acceleration field may seem to be messy (although the magnitude will + be reflected) while the trend of acceleration when `preserve_len` is `False` is clearer but will lose information of acceleration magnitude. This is because the acceleration is not directly related to the distance of cells in the low embedding space; thus the acceleration direction can be better preserved than the magnitude. On the other hand, velocity is more relevant to the distance in low embedding space, so preserving magnitude and direction of velocity vector in low dimension can be more easily achieved. - other_kernels_dict: `dict` (default: `{}`) - A dictionary of paramters that will be passed to the cosine/correlation kernel. + other_kernels_dict: A dictionary of paramters that will be passed to the cosine/correlation kernel. - Returns - ------- - adata: :class:`~anndata.AnnData` - Returns an updated `~anndata.AnnData` with transition_matrix and projected embedding of high dimension + Returns: + adata: Returns an updated `~anndata.AnnData` with transition_matrix and projected embedding of high dimension acceleration vectors in the existing embeddings of current cell state, calculated using either the Itô - kernel method (default) or the diffusion approximation or the method from (La Manno et al. 2018). + kernel method (default), the diffusion approximation, or the method from (La Manno et al. 2018). """ if "velocity_" + vf_basis not in adata.obsm.keys(): @@ -99,8 +100,14 @@ def cell_accelerations( def cell_curvatures( - adata, vf_basis="pca", basis="umap", enforce=True, preserve_len=True, other_kernels_dict={}, **kwargs -): + adata: AnnData, + vf_basis: str = "pca", + basis: str = "umap", + enforce: bool = True, + preserve_len: bool = True, + other_kernels_dict: Dict = {}, + **kwargs +) -> None: """Compute RNA curvature field via reconstructed vector field and project it to low dimensional embeddings. In classical physics, including fluidics and aerodynamics, velocity and acceleration vector fields are used as @@ -127,20 +134,14 @@ def cell_curvatures( provides such functionalities in dynamo, with vector field that changes over time, similar methods, for example, streakline, pathline, timeline, etc. can be used to visualize the evolution of single cell or cell populations. - Arguments - --------- - adata: :class:`~anndata.AnnData` - an Annodata object. - vf_basis: 'int' (optional, default `pca`) - The dictionary key that corresponds to the low dimensional embedding where the vector field function + Args: + adata: an AnnData object. + vf_basis: The dictionary key that corresponds to the low dimensional embedding where the vector field function reconstructed. - basis: 'int' (optional, default `umap`) - The dictionary key that corresponds to the reduced dimension in `.obsm` attribute. - enforce: `bool` (default: `False`) - Whether to enforce 1) redefining use_for_transition column in obs attribute; + basis: The dictionary key that corresponds to the reduced dimension in `.obsm` attribute. + enforce: Whether to enforce 1) redefining use_for_transition column in obs attribute; 2) recalculation of transition matrix. - preserve_len: `bool` (default: `True`) - Whether to preserve the length of high dimension vector length. When set to be True, the length of low + preserve_len: Whether to preserve the length of high dimension vector length. When set to be True, the length of low dimension projected vector will be proportionally scaled to that of the high dimensional vector. Note that when preserve_len is set to be True, the acceleration field may seem to be messy (although the magnitude will be reflected) while the trend of acceleration when `preserve_len` is `True` is more clearer but will lose @@ -148,13 +149,10 @@ def cell_curvatures( distance of cells in the low embedding space; thus the acceleration direction can be better preserved than the magnitude. On the other hand, velocity is more relevant to the distance in low embedding space, so preserving magnitude and direction of velocity vector in low dimension can be more easily achieved. - other_kernels_dict: `dict` (default: `{}`) - A dictionary of paramters that will be passed to the cosine/correlation kernel. + other_kernels_dict: A dictionary of paramters that will be passed to the cosine/correlation kernel. - Returns - ------- - adata: :class:`~anndata.AnnData` - Returns an updated `~anndata.AnnData` with transition_matrix and projected embedding of high dimension + Returns: + adata: Returns an updated `~anndata.AnnData` with transition_matrix and projected embedding of high dimension curvature vectors in the existing embeddings of current cell state, calculated using either the Itô kernel method (default) or the diffusion approximation or the method from (La Manno et al. 2018). """ diff --git a/dynamo/vectorfield/clustering.py b/dynamo/vectorfield/clustering.py index 4c93387ba..aac4e62df 100644 --- a/dynamo/vectorfield/clustering.py +++ b/dynamo/vectorfield/clustering.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import List, Optional, Union import numpy as np import pandas as pd @@ -21,18 +21,18 @@ def cluster_field( - adata, - basis="pca", - features=["speed", "potential", "divergence", "acceleration", "curvature", "curl"], - add_embedding_basis=True, - embedding_basis=None, - normalize=False, - method="leiden", - cores=1, - copy=False, - resolution=1.0, + adata: AnnData, + basis: str = "pca", + features: List = ["speed", "potential", "divergence", "acceleration", "curvature", "curl"], + add_embedding_basis: bool = True, + embedding_basis: Optional[str] = None, + normalize: bool = False, + method: str = "leiden", + cores: int = 1, + copy: bool = False, + resolution: float = 1.0, **kwargs, -): +) -> Optional[AnnData]: """Cluster cells based on vector field features. We would like to see whether the vector field can be used to better define cell state/types. This can be accessed @@ -48,37 +48,29 @@ def cluster_field( by the classical dynamic system methods, we can use some machine learning approaches that are based on extracting geometric features of streamline to "cluster vector field space" for define cell states/type. This requires calculating, potential (ordered pseudotime), speed, curliness, divergence, acceleration, curvature, etc. Thanks to - the fact that we can analytically calculate Jacobian matrix matrix, those quantities of the vector field function + the fact that we can analytically calculate the Jacobian matrix, those quantities of the vector field function can be conveniently and efficiently calculated. - Parameters - ---------- - adata: :class:`~anndata.AnnData`. - adata object that includes both newly synthesized and total gene expression of cells. Alternatively, - the object should include both unspliced and spliced gene expression of cells. - basis: `str` or None (default: `None`) - The space that will be used for calculating vector field features. Valid names includes, for example, `pca`, - `umap`, etc. - embedding_basis: `str` or None (default: `None`) - The embedding basis that will be combined with the vector field feature space for clustering. - normalize: `bool` (default: `False`) - Whether to mean center and scale the feature across all cells. - method: `str` (default: `leiden`) - The method that will be used for clustering, one of `{'kmeans'', 'hdbscan', 'louvain', 'leiden'}`. If `louvain` - or `leiden` used, you need to have `cdlib` installed. - cores: `int` (default: 1) - The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a - :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. - copy: - Whether to return a new deep copy of `adata` instead of updating `adata` object passed in arguments. - resolution: - Clustering resolution, higher values yield more fine-grained clusters. - kwargs: - Any additional arguments that will be passed to either kmeans, hdbscan, louvain or leiden clustering algorithms. - - Returns - ------- + Args: + adata: adata object that includes both newly synthesized and total gene expression of cells. Alternatively, + the object should include both unspliced and spliced gene expression of cells. + basis: The space that will be used for calculating vector field features. Valid names includes, for example, `pca`, + `umap`, etc. + features: features have to be selected from ['speed', 'potential', 'divergence', 'acceleration', 'curvature', 'curl'] + add_embedding_basis: Whether to add the embedding basis to the feature space for clustering. + embedding_basis: The embedding basis that will be combined with the vector field feature space for clustering. + normalize: Whether to mean center and scale the feature across all cells. + method: The method that will be used for clustering, one of `{'kmeans'', 'hdbscan', 'louvain', 'leiden'}`. If `louvain` + or `leiden` used, you need to have `cdlib` installed. + cores: The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. + copy: Whether to return a new deep copy of `adata` instead of updating `adata` object passed in arguments. + resolution: Clustering resolution, higher values yield more fine-grained clusters. + kwargs: Any additional arguments that will be passed to either kmeans, hdbscan, louvain or leiden clustering algorithms. + + Returns: + Either updates `adata` or directly returns a new `adata` object if `copy` is `True`. """ @@ -91,7 +83,7 @@ def cluster_field( ) if len(features) < 1: raise ValueError( - "features has to be selected from ['speed', 'potential', 'divergence', 'acceleration', " + "features have to be selected from ['speed', 'potential', 'divergence', 'acceleration', " f"'curvature', 'curl']. your feature is {features}" ) @@ -209,24 +201,43 @@ def streamline_clusters( assign_fixedpoints: bool = False, reversed_fixedpoints: bool = False, **kwargs, -): - """ - - Parameters - ---------- - adata - basis - features - method - xy_grid_nums - density - curvature_method - feature_bins - clustering_method - - Returns - ------- - +) -> None: + """Cluster 2D streamlines based on vector field features. Initialize a grid over the state space and compute the + flow of data through the grid using plt.streamplot with a given density. For each point individual streamline, + computes the vector field 'features' of interest and stores the data via histograms. Add fixed points and + "reversed fixed points" (sources of the streamlines) to the feature data dataframe based on the + 'assigned_fixedpoints' and 'reversed_fixedpoints' args. Finally, then cluster the streamlines based on these + features using the given 'clustering_method'. + + Args: + adata: An AnnData object representing the network to be analyzed. + basis: The basis to use for creating the vector field, either "umap" or "tsne". Defaults to "umap". + features: A list of features to calculate for each point in the vector field. Defaults to ["speed", "divergence", "acceleration", "curvature", "curl"]. + method: The method to use for calculating the flow of data through the grid, either "sparsevfc" or "gaussian". Defaults to "sparsevfc". + xy_grid_nums: The number of points to use in the x and y dimensions of the grid. Defaults to [50, 50]. + density: The density of the grid. Defaults to 5. + curvature_method: The method to use for calculating curvature. Defaults to 1. + feature_bins: The number of bins to use for discretizing the data. Defaults to 10. + clustering_method: The method to use for clustering the data into modules, either "louvain" or "leiden". Defaults to "leiden". + assign_fixedpoints: A boolean indicating whether to assign fixed points to the data. Defaults to False. + reversed_fixedpoints: A boolean indicating whether to reverse the fixed points assignment. Defaults to False. + + Raises: + ImportError: If the "cdlib" package is not installed and the "louvain" or "leiden" clustering method is specified. + ValueError: If an invalid method is specified for calculating the flow of data through the grid. + ValueError: If an invalid method is specified for clustering the data into modules. + + Returns: + None, but updates the `adata` object with the following fields of the `adata.uns["streamline_clusters_" + basis]` + - "feature_df" + - "segments" + - "X_pca" + - "clustering_method" + - "distances" + - "connectivities" + - "clusters" + - "fixed_point" + - "rev_fixed_point" """ import matplotlib.pyplot as plt @@ -338,7 +349,7 @@ def streamline_clusters( line_len.append(values.shape[0]) tmp = None if has_acc: - acceleration_val, acceleration_vec = vector_field_class.compute_acceleration(values) + acceleration_val, _ = vector_field_class.compute_acceleration(values) acc_dict[key] = acceleration_val _, acc_hist = np.histogram(acceleration_val, bins=(bins - 1), density=True) diff --git a/dynamo/vectorfield/least_action_path.py b/dynamo/vectorfield/least_action_path.py index 0ca3a097e..a1311aef0 100644 --- a/dynamo/vectorfield/least_action_path.py +++ b/dynamo/vectorfield/least_action_path.py @@ -1,8 +1,21 @@ +from typing import Callable, Optional, Tuple + import numpy as np -from scipy.optimize import minimize +from scipy.optimize import OptimizeResult, minimize + + +def action(path: np.ndarray, vf_func: Callable, D=1, dt=1) -> float: + """Compute the action of a path by taking the sum of the squared distance between the path and the vector field and dividing by twice the diffusion constant. Conceptually, the action represents deviations from the streamline of the vector field. Reference Box 3 in the publication for more information. + Args: + path: sequence of points in the state space collectively representing the path of interest + vf_func: function that takes a point in the state space and returns the vector field at that point + D: Diffusion constant, Defaults to 1. + dt: Time step for moving from one state to another within the path, Defaults to 1. -def action(path, vf_func, D=1, dt=1): + Returns: + the action of the path + """ # centers x = (path[:-1] + path[1:]) * 0.5 v = np.diff(path, axis=0) / dt @@ -18,7 +31,19 @@ def action_aux(path_flatten, vf_func, dim, start=None, end=None, **kwargs): return action(path, vf_func, **kwargs) -def action_grad(path, vf_func, jac_func, D=1, dt=1): +def action_grad(path: np.ndarray, vf_func: Callable, jac_func: Callable, D: float = 1, dt: float = 1) -> np.ndarray: + """Compute the gradient of the action with respect to each component of each point in the path using the analytical Jacobian. + + Args: + path: sequence of points in the state space collectively representing the path of interest + vf_func: function that takes a point in the state space and returns the vector field at that point + jac_func: function for computing Jacobian given cell state + D: Diffusion constant, Defaults to 1. + dt: Time step for moving from one state to another within the path, Defaults to 1. + + Returns: + gradient of the action with respect to each component of each point in the path + """ x = (path[:-1] + path[1:]) * 0.5 v = np.diff(path, axis=0) / dt @@ -45,7 +70,30 @@ def reshape_path(path_flatten, dim, start=None, end=None): return path -def least_action_path(start, end, vf_func, jac_func, n_points=20, init_path=None, D=1): +def least_action_path( + start: np.ndarray, + end: np.ndarray, + vf_func: Callable, + jac_func: Callable, + n_points: int = 20, + init_path: Optional[np.ndarray] = None, + D: int = 1, +) -> Tuple[np.ndarray, OptimizeResult]: + """Compute the least action path between two points using gradient descent optimization. + + Args: + start: The starting point for the path + end: The ending point for the path + vf_func: A function that returns the vector field of the system at a given point. + jac_func: A function that returns the Jacobian at a given point. + n_points: The number of intermediate points to use when initializing the path. Defaults to 20. + init_path: An optional initial path to use instead of the default initialization. Defaults to None. + D: The diffusion constant. Defaults to 1. + + Returns: + A tuple containing the optimized least action path and the optimization result. The least action path is a numpy array of shape (n_points + 2, D), where n_points is the number of intermediate points used in the initialization and D is the dimension of start and end. The optimization result is a scipy.optimize.OptimizeResult object containing information about the optimization process. + """ + dim = len(start) if init_path is None: path_0 = ( diff --git a/dynamo/vectorfield/networks.py b/dynamo/vectorfield/networks.py index 1bbdcdd8f..13a33af2d 100644 --- a/dynamo/vectorfield/networks.py +++ b/dynamo/vectorfield/networks.py @@ -1,35 +1,32 @@ +from typing import Dict, List, Optional, Union + +import networkx as nx import numpy as np import pandas as pd +from anndata import AnnData from ..dynamo_logger import main_debug, main_info, main_tqdm -from .vector_calculus import rank_jacobian_genes +from .rank_vf import rank_jacobian_genes def get_interaction_in_cluster( - rank_df_dict, - group, - genes, - n_top_genes=100, - rank_regulators=False, - negative_values=False, -): + rank_df_dict: Dict[str, pd.DataFrame], + group: str, + genes: List, + n_top_genes: int = 100, + rank_regulators: bool = False, + negative_values: bool = False, +) -> pd.DataFrame: """Retrieve interactions among input genes given the ranking dataframe. - Parameters - ---------- - rank_df_dict: `dict` of `pandas.DataFrame` - The dictionary of pandas data frame storing the gene ranking information for each cluster. - group: `str` - The group name that points to the key for the rank_df. - genes: `list` - The list of input genes, from which the network will be constructed. - n_top_genes: `int` - Number of top genes that will be selected from to build the network. - rank_regulators - Whether the input dictionary is about ranking top regulators of each gene per cluster. - - Returns - ------- + Args: + rank_df_dict: The dictionary of pandas data frame storing the gene ranking information for each cluster. + group: The group name that points to the key for the rank_df. + genes: The list of input genes, from which the network will be constructed. + n_top_genes: Number of top genes that will be selected from to build the network. + rank_regulators: Whether the input dictionary is about ranking top regulators of each gene per cluster. + + Returns: A dataframe of interactions between input genes for the specified group of cells based on ranking information of Jacobian analysis. It has `regulator`, `target` and `weight` three columns. @@ -78,41 +75,32 @@ def get_interaction_in_cluster( def build_network_per_cluster( - adata, - cluster, - cluster_names=None, - full_reg_rank=None, - full_eff_rank=None, - genes=None, - n_top_genes=100, - abs=False, -): + adata: AnnData, + cluster: str, + cluster_names: Optional[str] = None, + full_reg_rank: Optional[Dict] = None, + full_eff_rank: Optional[Dict] = None, + genes: Optional[List] = None, + n_top_genes: int = 100, + abs: bool = False, +) -> Dict[str, pd.DataFrame]: """Build a cluster specific network between input genes based on ranking information of Jacobian analysis. - Parameters - ---------- - adata: :class:`~anndata.AnnData`. - AnnData object, must at least have gene-wise Jacobian matrix calculated for each or selected cell. - cluster: `str` - The group key that points to the columns of `adata.obs`. - cluster_names: `str` or `list` (default: `None`) - The groups whose networks will be constructed, must overlap with names in adata.obs and / or keys from the + Args: + adata: AnnData object, must at least have gene-wise Jacobian matrix calculated for each or selected cell. + cluster: The group key that points to the columns of `adata.obs`. + cluster_names: The groups whose networks will be constructed, must overlap with names in adata.obs and / or keys from the ranking dictionaries. - full_reg_rank: `dict` (default: `None`) - The dictionary stores the regulator ranking information per cluster based on cell-wise Jacobian matrix. If + full_reg_rank: The dictionary stores the regulator ranking information per cluster based on cell-wise Jacobian matrix. If None, we will call `rank_jacobian_genes(adata, groups=cluster, mode='full reg', abs=True, output_values=True)` to first obtain this dictionary. - full_eff_rank (default: `None`) - The dictionary stores the effector ranking information per cluster based on cell-wise Jacobian matrix. If + full_eff_rank: The dictionary stores the effector ranking information per cluster based on cell-wise Jacobian matrix. If None, we will call `rank_jacobian_genes(adata, , groups=cluster, mode='full eff', abs=True, output_values=True)` to first obtain this dictionary. - genes: `list` (default: `None`) - The list of input genes, from which the network will be constructed. - n_top_genes: `int` (default: `100`) - Number of top genes that will be selected from to build the network. + genes: The list of input genes, from which the network will be constructed. + n_top_genes: Number of top genes that will be selected from to build the network. - Returns - ------- + Returns: A dictionary of dataframe of interactions between input genes for each group of cells based on ranking information of Jacobian analysis. Each composite dataframe has `regulator`, `target` and `weight` three columns. """ @@ -182,23 +170,19 @@ def build_network_per_cluster( return edges_list -def adj_list_to_matrix(adj_list, only_one_edge=False, clr=False, graph=False): +def adj_list_to_matrix( + adj_list: pd.DataFrame, only_one_edge: bool = False, clr: bool = False, graph: bool = False +) -> Union[pd.DataFrame, nx.Graph]: """Convert a pandas adjacency list (with regulator, target, weight columns) to a processed adjacency matrix (or network). - Parameters - ---------- - adj_list: `pandas.DataFrae` - A pandas adjacency dataframe with regulator, target, weight columns for representing a network graph. - only_one_edge: `bool` - Whether or not to only keep the edges with higher weight for any two gene pair. - clr: `bool` - Whether to post-process the direct network via the context likelihood relatedness. - graph: `bool` - Whether a direct, weighted graph based on networkx should be returned. - - Returns - ------- + Args: + adj_list: A pandas adjacency dataframe with regulator, target, weight columns for representing a network graph. + only_one_edge: Whether or not to only keep the edges with higher weight for any two gene pair. + clr: Whether to post-process the direct network via the context likelihood relatedness. + graph: Whether a direct, weighted graph based on networkx should be returned. + + Returns: A pandas adjacency matrix or a direct, weighted graph constructed via networkx. """ diff --git a/dynamo/vectorfield/rank_vf.py b/dynamo/vectorfield/rank_vf.py new file mode 100644 index 000000000..348a5b3d7 --- /dev/null +++ b/dynamo/vectorfield/rank_vf.py @@ -0,0 +1,823 @@ +# from tqdm import tqdm + +# from anndata._core.views import ArrayView +# import scipy.sparse as sp +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import pandas as pd +from anndata._core.anndata import AnnData +from numpy.typing import DTypeLike + +from ..dynamo_logger import main_info_insert_adata_uns +from ..tools.utils import ( + create_layer, + get_rank_array, + index_gene, + list_top_genes, + list_top_interactions, + table_top_genes, +) +from ..utils import isarray, ismatrix +from .utils import average_jacobian_by_group, intersect_sources_targets + +try: + import dynode + + use_dynode = "vectorfield" in dir(dynode) +except ImportError: + use_dynode = False + +if use_dynode: + from .scVectorField import dynode_vectorfield + + +def rank_genes( + adata: AnnData, + arr_key: Union[str, np.ndarray], + groups: Optional[str] = None, + genes: Optional[List] = None, + abs: bool = False, + normalize: bool = False, + fcn_pool: Callable = lambda x: np.mean(x, axis=0), + dtype: Optional[DTypeLike] = None, + output_values: bool = False, +) -> pd.DataFrame: + """Rank gene's absolute, positive, negative vector field metrics by different cell groups. + + Args: + adata: AnnData object that contains the array to be sorted in `.var` or `.layer`. + arr_key: The key of the to-be-ranked array stored in `.var` or or `.layer`. + If the array is found in `.var`, the `groups` argument will be ignored. + If a numpy array is passed, it is used as the array to be ranked and must + be either an 1d array of length `.n_var`, or a `.n_obs`-by-`.n_var` 2d array. + groups: Cell groups used to group the array. + genes: The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. + abs: When pooling the values in the array (see below), whether to take the absolute values. + normalize: bool (default: False) + Whether normalize the array across all cells first, if the array is 2d. + fcn_pool: callable (default: numpy.mean(x, axis=0)) + The function used to pool values in the to-be-ranked array if the array is 2d. + output_values: bool (default: False) + Whether output the values along with the rankings. + + Returns: + A dataframe of gene names and values based on which the genes are sorted for each cell group. + """ + + genes, arr = get_rank_array( + adata, + arr_key, + genes=genes, + abs=abs, + dtype=dtype, + ) + + if arr.ndim > 1: + if normalize: + arr_max = np.max(np.abs(arr), axis=0) + arr = arr / arr_max + arr[np.isnan(arr)] = 0 + if groups is not None: + if type(groups) is str and groups in adata.obs.keys(): + grps = np.array(adata.obs[groups]) + elif isarray(groups): + grps = np.array(groups) + else: + raise Exception(f"The group information {groups} you provided is not in your adata object.") + arr_dict = {} + for g in np.unique(grps): + arr_dict[g] = fcn_pool(arr[grps == g]) + else: + arr_dict = {"all": fcn_pool(arr)} + else: + arr_dict = {"all": arr} + + ret_dict = {} + var_names = np.array(index_gene(adata, adata.var_names, genes)) + for g, arr in arr_dict.items(): + if ismatrix(arr): + arr = arr.A.flatten() + glst, sarr = list_top_genes(arr, var_names, None, return_sorted_array=True) + # ret_dict[g] = {glst[i]: sarr[i] for i in range(len(glst))} + ret_dict[g] = glst + if output_values: + ret_dict[g + "_values"] = sarr + return pd.DataFrame(data=ret_dict) + + +def rank_cells( + adata: AnnData, + arr_key: Union[str, np.ndarray], + groups: Optional[str] = None, + genes: Optional[List] = None, + abs: bool = False, + fcn_pool: Callable = lambda x: np.mean(x, axis=0), + dtype: Optional[DTypeLike] = None, + output_values: bool = False, +) -> pd.DataFrame: + """Rank cell's absolute, positive, negative vector field metrics by different gene groups. + + Args: + adata: AnnData object that contains the array to be sorted in `.var` or `.layer`. + arr_key: The key of the to-be-ranked array stored in `.var` or or `.layer`. + If the array is found in `.var`, the `groups` argument will be ignored. + If a numpy array is passed, it is used as the array to be ranked and must + be either an 1d array of length `.n_var`, or a `.n_obs`-by-`.n_var` 2d array. + groups: Gene groups used to group the array. + genes: The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. + abs: When pooling the values in the array (see below), whether to take the absolute values. + fcn_pool: The function used to pool values in the to-be-ranked array if the array is 2d. + + Returns: + A dataframe of cells names and values based on which the genes are sorted for each gene group. + """ + + genes, arr = get_rank_array( + adata, + arr_key, + genes=genes, + abs=abs, + dtype=dtype, + ) + arr = arr.T + + if arr.ndim > 1: + if groups is not None: + if type(groups) is str and groups in adata.var.keys(): + grps = np.array(adata.var[groups]) # check this + elif isarray(groups): + grps = np.array(groups) + else: + raise Exception(f"The group information {groups} you provided is not in your adata object.") + arr_dict = {} + for g in np.unique(grps): + arr_dict[g] = fcn_pool(arr[grps == g]) + else: + arr_dict = {"all": fcn_pool(arr)} + else: + arr_dict = {"all": arr} + + ret_dict = {} + cell_names = np.array(adata.obs_names) + for g, arr in arr_dict.items(): + if ismatrix(arr): + arr = arr.A.flatten() + glst, sarr = list_top_genes(arr, cell_names, None, return_sorted_array=True) + # ret_dict[g] = {glst[i]: sarr[i] for i in range(len(glst))} + ret_dict[g] = glst + if output_values: + ret_dict[g + "_values"] = sarr + return pd.DataFrame(data=ret_dict) + + +def rank_cell_groups( + adata: AnnData, + arr_key: Union[str, np.ndarray], + groups: Optional[str] = None, + genes: Optional[List] = None, + abs: bool = False, + fcn_pool: Callable = lambda x: np.mean(x, axis=0), + dtype: Optional[DTypeLike] = None, + output_values: bool = False, +) -> pd.DataFrame: + """Rank cell's absolute, positive, negative vector field metrics by different gene groups. + + Args: + adata: AnnData object that contains the array to be sorted in `.var` or `.layer`. + arr_key: str or :class:`~numpy.ndarray` + The key of the to-be-ranked array stored in `.var` or or `.layer`. + If the array is found in `.var`, the `groups` argument will be ignored. + If a numpy array is passed, it is used as the array to be ranked and must + be either an 1d array of length `.n_var`, or a `.n_obs`-by-`.n_var` 2d array. + groups: Gene groups used to group the array. + genes: The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. + abs: When pooling the values in the array (see below), whether to take the absolute values. + fcn_pool: The function used to pool values in the to-be-ranked array if the array is 2d. + output_values: hether output the values along with the rankings. + + Returns + A dataframe of cells names and values based on which the genes are sorted for each gene group. + """ + + genes, arr = get_rank_array( + adata, + arr_key, + genes=genes, + abs=abs, + dtype=dtype, + ) + arr = arr.T + + if arr.ndim > 1: + if groups is not None: + if type(groups) is str and groups in adata.var.keys(): + grps = np.array(adata.var[groups]) # check this + elif isarray(groups): + grps = np.array(groups) + else: + raise Exception(f"The group information {groups} you provided is not in your adata object.") + arr_dict = {} + for g in np.unique(grps): + arr_dict[g] = fcn_pool(arr[grps == g]) + else: + arr_dict = {"all": fcn_pool(arr)} + else: + arr_dict = {"all": arr} + + ret_dict = {} + cell_names = np.array(adata.obs_names) + for g, arr in arr_dict.items(): + if ismatrix(arr): + arr = arr.A.flatten() + glst, sarr = list_top_genes(arr, cell_names, None, return_sorted_array=True) + # ret_dict[g] = {glst[i]: sarr[i] for i in range(len(glst))} + ret_dict[g] = glst + if output_values: + ret_dict[g + "_values"] = sarr + return pd.DataFrame(data=ret_dict) + + +def rank_expression_genes(adata: AnnData, ekey: str = "M_s", prefix_store: str = "rank", **kwargs) -> AnnData: + """Rank genes based on their expression values for each cell group. + + Args: + adata: AnnData object that contains the normalized or locally smoothed expression. + ekey: The expression key, can be any properly normalized layers, e.g. M_s, M_u, M_t, M_n. + prefix_store: The prefix added to the key for storing the returned in adata. + kwargs: + additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: + group: str or None (default: None) + The cell group that speed ranking will be grouped-by. + genes: list or None (default: None) + The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. + abs: bool (default: False) + When pooling the values in the array (see below), whether to take the absolute values. + normalize: bool (default: False) + Whether normalize the array across all cells first, if the array is 2d. + fcn_pool: callable (default: numpy.mean(x, axis=0)) + The function used to pool values in the to-be-ranked array if the array is 2d. + output_values: bool (default: False) + Whether output the values along with the rankings. + + Returns: + AnnData object which has the rank dictionary for expression in `.uns`. + """ + + rdict = rank_genes(adata, ekey, **kwargs) + adata.uns[prefix_store + "_" + ekey] = rdict + return adata + + +def rank_velocity_genes(adata, vkey="velocity_S", prefix_store="rank", **kwargs) -> AnnData: + """Rank genes based on their raw and absolute velocities for each cell group. + + Args: + adata: AnnData object that contains the gene-wise velocities. + vkey: The velocity key. + prefix_store: The prefix added to the key for storing the returned in adata. + kwargs: + additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: + group: str or None (default: None) + The cell group that speed ranking will be grouped-by. + genes: list or None (default: None) + The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. + abs: bool (default: False) + When pooling the values in the array (see below), whether to take the absolute values. + normalize: bool (default: False) + Whether normalize the array across all cells first, if the array is 2d. + fcn_pool: callable (default: numpy.mean(x, axis=0)) + The function used to pool values in the to-be-ranked array if the array is 2d. + output_values: bool (default: False) + Whether output the values along with the rankings. + + Returns: + AnnData object which has the rank dictionary for velocities in `.uns`. + """ + rdict = rank_genes(adata, vkey, **kwargs) + rdict_abs = rank_genes(adata, vkey, abs=True, **kwargs) + adata.uns[prefix_store + "_" + vkey] = rdict + adata.uns[prefix_store + "_abs_" + vkey] = rdict_abs + return adata + + +def rank_divergence_genes( + adata: AnnData, + jkey: str = "jacobian_pca", + genes: Optional[List] = None, + prefix_store: str = "rank_div_gene", + **kwargs, +) -> pd.DataFrame: + """Rank genes based on their diagonal Jacobian for each cell group. + Be aware that this 'divergence' refers to the diagonal elements of a gene-wise + Jacobian, rather than its trace, which is the common definition of the divergence. + + Run .vf.jacobian and set store_in_adata=True before using this function. + + Args: + adata: AnnData object that contains the reconstructed vector field in the `.uns` attribute. + jkey: The key in .uns of the cell-wise Jacobian matrix. + genes: A list of names for genes of interest. + prefix_store: The prefix added to the key for storing the returned ranking info in adata. + kwargs: + additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: + group: str or None (default: None) + The cell group that speed ranking will be grouped-by. + genes: list or None (default: None) + The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. + abs: bool (default: False) + When pooling the values in the array (see below), whether to take the absolute values. + normalize: bool (default: False) + Whether normalize the array across all cells first, if the array is 2d. + fcn_pool: callable (default: numpy.mean(x, axis=0)) + The function used to pool values in the to-be-ranked array if the array is 2d. + output_values: bool (default: False) + Whether output the values along with the rankings. + + Returns: + AnnData object which has the rank dictionary for diagonal jacobians in `.uns`. + """ + + if jkey not in adata.uns_keys(): + raise Exception(f"The provided dictionary key {jkey} is not in .uns.") + + reg = [x for x in adata.uns[jkey]["regulators"]] + eff = [x for x in adata.uns[jkey]["effectors"]] + if reg != eff: + raise Exception("The Jacobian should have the same regulators and effectors.") + else: + Genes = adata.uns[jkey]["regulators"] + cell_idx = adata.uns[jkey]["cell_idx"] + div = np.einsum("iij->ji", adata.uns[jkey]["jacobian_gene"]) + Div = create_layer(adata, div, genes=Genes, cells=cell_idx, dtype=np.float32) + + if genes is not None: + Genes = list(set(Genes).intersection(genes)) + + rdict = rank_genes( + adata, + Div, + fcn_pool=lambda x: np.nanmean(x, axis=0), + genes=Genes, + **kwargs, + ) + adata.uns[prefix_store + "_" + jkey] = rdict + return rdict + + +def rank_s_divergence_genes( + adata: AnnData, + skey: str = "sensitivity_pca", + genes: Optional[List] = None, + prefix_store: str = "rank_s_div_gene", + **kwargs, +) -> pd.DataFrame: + """Rank genes based on their diagonal Sensitivity for each cell group. + Be aware that this 'divergence' refers to the diagonal elements of a gene-wise + Sensitivity, rather than its trace, which is the common definition of the divergence. + + Run .vf.sensitivity and set store_in_adata=True before using this function. + + Args: + adata: AnnData object that contains the reconstructed vector field in the `.uns` attribute. + skey: The key in .uns of the cell-wise sensitivity matrix. + genes: A list of names for genes of interest. + prefix_store: The prefix added to the key for storing the returned ranking info in adata. + kwargs: + additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: + group: str or None (default: None) + The cell group that speed ranking will be grouped-by. + genes: list or None (default: None) + The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. + abs: bool (default: False) + When pooling the values in the array (see below), whether to take the absolute values. + normalize: bool (default: False) + Whether normalize the array across all cells first, if the array is 2d. + fcn_pool: callable (default: numpy.mean(x, axis=0)) + The function used to pool values in the to-be-ranked array if the array is 2d. + output_values: bool (default: False) + Whether output the values along with the rankings. + + Returns: + adata: AnnData object which has the rank dictionary for diagonal sensitivity in `.uns`. + """ + + if skey not in adata.uns_keys(): + raise Exception(f"The provided dictionary key {skey} is not in .uns.") + + reg = [x for x in adata.uns[skey]["regulators"]] + eff = [x for x in adata.uns[skey]["effectors"]] + if reg != eff: + raise Exception("The Jacobian should have the same regulators and effectors.") + else: + Genes = adata.uns[skey]["regulators"] + cell_idx = adata.uns[skey]["cell_idx"] + div = np.einsum("iij->ji", adata.uns[skey]["sensitivity_gene"]) + Div = create_layer(adata, div, genes=Genes, cells=cell_idx, dtype=np.float32) + + if genes is not None: + Genes = list(set(Genes).intersection(genes)) + + rdict = rank_genes( + adata, + Div, + fcn_pool=lambda x: np.nanmean(x, axis=0), + genes=Genes, + **kwargs, + ) + adata.uns[prefix_store + "_" + skey] = rdict + return rdict + + +def rank_acceleration_genes(adata, akey="acceleration", prefix_store="rank", **kwargs) -> AnnData: + """Rank genes based on their absolute, positive, negative accelerations for each cell group. + + Args: + adata: AnnData object that contains the reconstructed vector field function in the `uns` attribute. + akey: The acceleration key. + prefix_store: The prefix of the key that will be used to store the acceleration rank result. + kwargs: + additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: + group: str or None (default: None) + The cell group that speed ranking will be grouped-by. + genes: list or None (default: None) + The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. + abs: bool (default: False) + When pooling the values in the array (see below), whether to take the absolute values. + normalize: bool (default: False) + Whether normalize the array across all cells first, if the array is 2d. + fcn_pool: callable (default: numpy.mean(x, axis=0)) + The function used to pool values in the to-be-ranked array if the array is 2d. + output_values: bool (default: False) + Whether output the values along with the rankings. + + Returns: + adata: AnnData object that is updated with the `'rank_acceleration'` information in the `.uns`. + """ + + rdict = rank_genes(adata, akey, **kwargs) + rdict_abs = rank_genes(adata, akey, abs=True, **kwargs) + adata.uns[prefix_store + "_" + akey] = rdict + adata.uns[prefix_store + "_abs_" + akey] = rdict_abs + return adata + + +def rank_curvature_genes(adata: AnnData, ckey: str = "curvature", prefix_store: str = "rank", **kwargs): + """Rank gene's absolute, positive, negative curvature by different cell groups. + + Args: + adata: AnnData object that contains the reconstructed vector field function in the `.uns` attribute. + ckey: The curvature key. + prefix_store: The prefix of the key that will be used to store the acceleration rank result. + kwargs: + additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: + group: str or None (default: None) + The cell group that speed ranking will be grouped-by. + genes: list or None (default: None) + The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. + abs: bool (default: False) + When pooling the values in the array (see below), whether to take the absolute values. + normalize: bool (default: False) + Whether normalize the array across all cells first, if the array is 2d. + fcn_pool: callable (default: numpy.mean(x, axis=0)) + The function used to pool values in the to-be-ranked array if the array is 2d. + output_values: bool (default: False) + Whether output the values along with the rankings. + + Returns: + AnnData object that is updated with the `'rank_curvature'` related information in the .uns. + """ + rdict = rank_genes(adata, ckey, **kwargs) + rdict_abs = rank_genes(adata, ckey, abs=True, **kwargs) + adata.uns[prefix_store + "_" + ckey] = rdict + adata.uns[prefix_store + "_abs_" + ckey] = rdict_abs + return adata + + +def rank_jacobian_genes( + adata: AnnData, + groups: Optional[str] = None, + jkey: str = "jacobian_pca", + abs: bool = False, + mode: str = "full reg", + exclude_diagonal: bool = False, + normalize: bool = False, + return_df: bool = False, + **kwargs, +) -> Optional[pd.DataFrame]: + """Rank genes or gene-gene interactions based on their Jacobian elements for each cell group. + + Run .vf.jacobian and set store_in_adata=True before using this function. + + Args: + adata: AnnData object that contains the reconstructed vector field in the `.uns` attribute. + groups: Cell groups used to group the Jacobians. + jkey: The key of the stored Jacobians in `.uns`. + abs: Whether take the absolute value of the Jacobian. + mode: {'full reg', 'full eff', 'reg', 'eff', 'int', 'switch'} (default: 'full_reg') + The mode of ranking: + (1) `'full reg'`: top regulators are ranked for each effector for each cell group; + (2) `'full eff'`: top effectors are ranked for each regulator for each cell group; + (3) '`reg`': top regulators in each cell group; + (4) '`eff`': top effectors in each cell group; + (5) '`int`': top effector-regulator pairs in each cell group. + (6) '`switch`': top effector-regulator pairs that show mutual inhibition pattern in each cell group. + exclude_diagonal: Whether to consider the self-regulation interactions (diagnoal of the jacobian matrix) + normalize: Whether to normalize the Jacobian across all cells before performing the ranking. + return_df: Whether to return the data or to save results in adata object via the key `mode` of adata.uns. + kwargs: Keyword arguments passed to ranking functions. + + Returns: + rank_info: + different modes return different types of return values + 1. full reg and full eff: + A pandas dataframe containing ranking info based on Jacobian elements + 2. reg eff int: + A dictionary object whose keys correspond to groups, and whose values are + specific rank's pd dataframe + """ + J_dict = adata.uns[jkey] + J = J_dict["jacobian_gene"] + if abs: + J = np.abs(J) + + if normalize: + Jmax = np.max(np.abs(J), axis=2) + for i in range(J.shape[2]): + J[:, :, i] /= Jmax + + if mode == "switch": + J_transpose = J.transpose(1, 0, 2) + J_mul = J * J_transpose + # switch genes will have negative Jacobian between any two gene pairs + # only True * True = 1, so only the gene pair with both negative Jacobian, this will be non-zero: + J = J_mul * (np.sign(J) == -1) * (np.sign(J_transpose) == -1) + + if groups is None: + J_mean = {"all": np.mean(J, axis=2)} + else: + if type(groups) is str and groups in adata.obs.keys(): + grps = np.array(adata.obs[groups]) + elif isarray(groups): + grps = np.array(groups) + else: + raise Exception(f"The group information {groups} you provided is not in your adata object.") + J_mean = average_jacobian_by_group(J, grps[J_dict["cell_idx"]]) + + eff = np.array([x for x in J_dict["effectors"]]) + reg = np.array([x for x in J_dict["regulators"]]) + rank_dict = {} + ov = kwargs.pop("output_values", True) + if mode in ["full reg", "full_reg"]: + for k, J in J_mean.items(): + rank_dict[k] = table_top_genes(J, eff, reg, n_top_genes=None, output_values=ov, **kwargs) + elif mode in ["full eff", "full_eff"]: + for k, J in J_mean.items(): + rank_dict[k] = table_top_genes(J.T, reg, eff, n_top_genes=None, output_values=ov, **kwargs) + elif mode == "reg": + for k, J in J_mean.items(): + if exclude_diagonal: + for i, ef in enumerate(eff): + ii = np.where(reg == ef)[0] + if len(ii) > 0: + J[i, ii] = np.nan + j = np.nanmean(J, axis=0) + if ov: + rank_dict[k], rank_dict[k + "_values"] = list_top_genes( + j, reg, None, return_sorted_array=True, **kwargs + ) + else: + rank_dict[k] = list_top_genes(j, reg, None, **kwargs) + rank_dict = pd.DataFrame(data=rank_dict) + elif mode == "eff": + for k, J in J_mean.items(): + if exclude_diagonal: + for i, re in enumerate(reg): + ii = np.where(eff == re)[0] + if len(ii) > 0: + J[ii, i] = np.nan + j = np.nanmean(J, axis=1) + if ov: + rank_dict[k], rank_dict[k + "_values"] = list_top_genes( + j, eff, None, return_sorted_array=True, **kwargs + ) + else: + rank_dict[k] = list_top_genes(j, eff, None, **kwargs) + rank_dict = pd.DataFrame(data=rank_dict) + elif mode in ["int", "switch"]: + for k, J in J_mean.items(): + ints, vals = list_top_interactions(J, eff, reg, **kwargs) + rank_dict[k] = [] + if ov: + rank_dict[k + "_values"] = [] + for ind, int_val in enumerate(ints): + if not (exclude_diagonal and int_val[0] == int_val[1]): + rank_dict[k].append(int_val[0] + " - " + int_val[1]) + if ov: + rank_dict[k + "_values"].append(vals[ind]) + rank_dict = pd.DataFrame(data=rank_dict) + else: + raise ValueError(f"No such mode as {mode}.") + + if return_df: + return rank_dict + else: + main_info_insert_adata_uns(mode) + adata.uns[mode] = rank_dict + + +def rank_sensitivity_genes( + adata: AnnData, + groups: Optional[str] = None, + skey: str = "sensitivity_pca", + abs: bool = False, + mode: str = "full reg", + exclude_diagonal: bool = False, + **kwargs, +) -> pd.DataFrame: + """Rank genes or gene-gene interactions based on their sensitivity elements for each cell group. + + Run .vf.sensitivity and set store_in_adata=True before using this function. + + Args: + adata: AnnData object that contains the reconstructed vector field in the `.uns` attribute. + groups: Cell groups used to group the sensitivity. + skey: The key of the stored sensitivity in `.uns`. + abs: Whether or not to take the absolute value of the Jacobian. + mode: {'full reg', 'full eff', 'reg', 'eff', 'int'} (default: 'full_reg') + The mode of ranking: + (1) `'full reg'`: top regulators are ranked for each effector for each cell group; + (2) `'full eff'`: top effectors are ranked for each regulator for each cell group; + (3) '`reg`': top regulators in each cell group; + (4) '`eff`': top effectors in each cell group; + (5) '`int`': top effector-regulator pairs in each cell group. + exclude_diagonal: Whether to consider the self-regulation interactions (diagnoal of the jacobian matrix) + kwargs: Keyword arguments passed to ranking functions. + + Returns: + AnnData object which has the rank dictionary in `.uns`. + """ + S_dict = adata.uns[skey] + S = S_dict["sensitivity_gene"] + if abs: + S = np.abs(S) + if groups is None: + S_mean = {"all": np.mean(S, axis=2)} + else: + if type(groups) is str and groups in adata.obs.keys(): + grps = np.array(adata.obs[groups]) + elif isarray(groups): + grps = np.array(groups) + else: + raise Exception(f"The group information {groups} you provided is not in your adata object.") + S_mean = average_jacobian_by_group(S, grps[S_dict["cell_idx"]]) + + eff = np.array([x for x in S_dict["effectors"]]) + reg = np.array([x for x in S_dict["regulators"]]) + rank_dict = {} + if mode in ["full reg", "full_reg"]: + for k, S in S_mean.items(): + rank_dict[k] = table_top_genes(S, eff, reg, n_top_genes=None, **kwargs) + elif mode in ["full eff", "full_eff"]: + for k, S in S_mean.items(): + rank_dict[k] = table_top_genes(S.T, reg, eff, n_top_genes=None, **kwargs) + elif mode == "reg": + ov = kwargs.pop("output_values", False) + for k, S in S_mean.items(): + if exclude_diagonal: + for i, ef in enumerate(eff): + ii = np.where(reg == ef)[0] + if len(ii) > 0: + S[i, ii] = np.nan + j = np.nanmean(S, axis=0) + if ov: + rank_dict[k], rank_dict[k + "_values"] = list_top_genes( + j, reg, None, return_sorted_array=True, **kwargs + ) + else: + rank_dict[k] = list_top_genes(j, reg, None, **kwargs) + rank_dict = pd.DataFrame(data=rank_dict) + elif mode == "eff": + ov = kwargs.pop("output_values", False) + for k, S in S_mean.items(): + if exclude_diagonal: + for i, re in enumerate(reg): + ii = np.where(eff == re)[0] + if len(ii) > 0: + S[ii, i] = np.nan + j = np.nanmean(S, axis=1) + if ov: + rank_dict[k], rank_dict[k + "_values"] = list_top_genes( + j, eff, None, return_sorted_array=True, **kwargs + ) + else: + rank_dict[k] = list_top_genes(j, eff, None, **kwargs) + rank_dict = pd.DataFrame(data=rank_dict) + elif mode == "int": + ov = kwargs.pop("output_values", False) + for k, S in S_mean.items(): + ints, vals = list_top_interactions(S, eff, reg, **kwargs) + rank_dict[k] = [] + if ov: + rank_dict[k + "_values"] = [] + for ind, int_val in enumerate(ints): + if not (exclude_diagonal and int_val[0] == int_val[1]): + rank_dict[k].append(int_val[0] + " - " + int_val[1]) + if ov: + rank_dict[k + "_values"].append(vals[ind]) + rank_dict = pd.DataFrame(data=rank_dict) + else: + raise ValueError(f"No such mode as {mode}.") + return rank_dict + + +# --------------------------------------------------------------------------------------------------- +# aggregate regulators or targets +def aggregateRegEffs( + adata: AnnData, + data_dict: Optional[Dict] = None, + reg_dict: Optional[Dict] = None, + eff_dict: Optional[Dict] = None, + key: str = "jacobian", + basis: str = "pca", + store_in_adata: bool = True, +) -> Union[AnnData, Dict]: + """Aggregate multiple genes' Jacobian or sensitivity. + + Args: + adata: AnnData object that contains the reconstructed vector field in `.uns`. + data_dict: A dictionary corresponds to the Jacobian or sensitivity information, must be calculated with either: + `dyn.vf.jacobian(adata, basis='pca', regulators=genes, effectors=genes)` or + `dyn.vf.sensitivity(adata, basis='pca', regulators=genes, effectors=genes)` + reg_dict: A dictionary in which keys correspond to regulator-groups (i.e. TFs for specific cell type) while values + a list of genes that must have at least one overlapped genes with that from the Jacobian or sensitivity + dict. + eff_dict: A dictionary in which keys correspond to effector-groups (i.e. markers for specific cell type) while values + a list of genes that must have at least one overlapped genes with that from the Jacobian or sensitivity + dict. + key: The key in .uns that corresponds to the Jacobian or sensitivity matrix information. + basis: The embedding data in which the vector field was reconstructed. If `None`, use the vector field function + that was reconstructed directly from the original unreduced gene expression space. + store_in_adata: hether to store the divergence result in adata. + + + Returns: + Depending on `store_in_adata`, it will either return a dictionary that include the aggregated Jacobian or + sensitivity information or the updated AnnData object that is updated with the `'aggregation'` key in the + `.uns`. This dictionary contains a 3-dimensional tensor with dimensions n_obs x n_regulators x n_effectors + as well as other information. + """ + + key_ = key if basis is None else key + "_" + basis + data_dict = adata.uns[key_] if data_dict is None else data_dict + + tensor, cell_idx, tensor_gene, regulators_, effectors_ = ( + data_dict.get(key), + data_dict.get("cell_idx"), + data_dict.get(key + "_gene"), + data_dict.get("regulators"), + data_dict.get("effectors"), + ) + + Aggregation = np.zeros((len(eff_dict), len(reg_dict), len(cell_idx))) + reg_ind = 0 + for reg_key, reg_val in reg_dict.items(): + eff_ind = 0 + for eff_key, eff_val in eff_dict.items(): + reg_val, eff_val = ( + list(np.unique(reg_val)) if reg_val is not None else None, + list(np.unique(eff_val)) if eff_val is not None else None, + ) + + Der, source_genes, target_genes = intersect_sources_targets( + reg_val, + regulators_, + eff_val, + effectors_, + tensor if tensor_gene is None else tensor_gene, + ) + if len(source_genes) + len(target_genes) > 0: + Aggregation[eff_ind, reg_ind, :] = Der.sum(axis=(0, 1)) # dim 0: target; dim 1: source + else: + Aggregation[eff_ind, reg_ind, :] = np.nan + eff_ind += 1 + reg_ind += 0 + + ret_dict = {"aggregation": None, "cell_idx": cell_idx} + # use 'str_key' in dict.keys() to check if these items are computed, or use dict.get('str_key') + if Aggregation is not None: + ret_dict["aggregation_gene"] = Aggregation + if reg_dict.keys() is not None: + ret_dict["regulators"] = list(reg_dict.keys()) + if eff_dict.keys() is not None: + ret_dict["effectors"] = list(eff_dict.keys()) + + det = [np.linalg.det(Aggregation[:, :, i]) for i in np.arange(Aggregation.shape[2])] + key = key + "_aggregation" if basis is None else key + "_aggregation_" + basis + adata.obs[key + "_det"] = np.nan + adata.obs[key + "_det"][cell_idx] = det + if store_in_adata: + adata.uns[key] = ret_dict + return adata + else: + return ret_dict diff --git a/dynamo/vectorfield/scPotential.py b/dynamo/vectorfield/scPotential.py index 8cd16086a..a2334d76e 100755 --- a/dynamo/vectorfield/scPotential.py +++ b/dynamo/vectorfield/scPotential.py @@ -1,8 +1,10 @@ from warnings import warn +from anndata._core.anndata import AnnData import numpy as np import scipy as sp import scipy.optimize +from typing import Callable, List, Optional, Tuple, Union from ..tools.sampling import lhsclassic from .Ao import Ao_pot_map @@ -19,17 +21,37 @@ def search_fixed_points( - func, - domain, - x0, - x0_method="lhs", - reverse=False, - return_x0=False, - fval_tol=1e-8, - remove_outliers=True, - ignore_fsolve_err=False, + func: Callable, + domain: np.ndarray, + x0: np.ndarray, + x0_method: str = "lhs", + reverse: bool = False, + return_x0: bool = False, + fval_tol: float = 1e-8, + remove_outliers: bool = True, + ignore_fsolve_err: bool = False, **fsolve_kwargs -): +) -> Union[FixedPoints, Tuple[FixedPoints, np.ndarray]]: + """Search the fixed points of (learned) vector field function in a given domain. + + The initial points are sampled by given methods. Then the function uses the fsolve function + from SciPy to find the fixed points and Numdifftools to compute the Jacobian matrix of the function. + + Args: + func: The function of the (learned) vector field function that are required to fixed points for. + domain: The domain to search in. + x0: The initial point to start with. + x0_method: The method to sample initial points. + reverse: Whether to reverse the sign (direction) of vector field (VF). + return_x0: Whether to return the initial points used in the search. + fval_tol: The tolerance for the function value at the fixed points. + remove_outliers: Whether to remove the outliers. + ignore_fsolve_err: Whether to ignore the fsolve error. + + Returns: + The fixed points found with their Jacobian matrix of the function. The sampled initial points + will be returned as well if return_x0 == True. + """ import numdifftools as nda func_ = (lambda x: -func(x)) if reverse else func @@ -86,44 +108,33 @@ def search_fixed_points( def gen_fixed_points( - func, - auto_func, - dim_range, - RandNum, - EqNum, - reverse=False, - grid_num=50, - x_ini=None, -): + func: Callable, + auto_func: Optional[np.ndarray], + dim_range: List, + RandNum: int, + EqNum: int, + reverse: bool = False, + grid_num: int = 50, + x_ini: Optional[np.ndarray] = None, +) -> Tuple[np.ndarray, np.ndarray]: """Calculate the fixed points of (learned) vector field function . Classify the fixed points into classes of stable and saddle points based on the eigenvalue of the Jacobian on the point. - Arguments - --------- - func: 'function' - The function of the (learned) vector field function that are required to fixed points for - auto_func: 'np.ndarray' (not used) - The function that is written with autograd of the same ODE equations that is used to calculate the Jacobian matrix. - If auto_func is set to be None, Jacobian is calculated through the fjac, r returned from fsolve. - dim_range: 'list' - The range of variables in the ODE equations - RandNum: 'int' - The number of random initial points to sample - EqNum: 'int' - The number of equations (dimension) of the system - reverse: `bool` - Whether to reverse the sign (direction) of vector field (VF). - grid_num: `int` (default: 50) - The number of grids on each dimension, only used when the EqNum is 2 and x_ini is None. - x_ini: 'np.ndarray' - The user provided initial points that is used to find the fixed points - - Returns - ------- - stable: 'np.ndarray' - A matrix consists of the coordinates of the stable steady state - saddle: 'np.ndarray' - A matrix consists of the coordinates of the unstable steady state + Args: + func: The function of the (learned) vector field function that are required to fixed points for + auto_func: The function that is written with autograd of the same ODE equations + that is used to calculate the Jacobian matrix. If auto_func is set to be None, + Jacobian is calculated through the fjac, r returned from fsolve. + dim_range: The range of variables in the ODE equations. + RandNum: The number of random initial points to sample. + EqNum: The number of equations (dimension) of the system. + reverse: Whether to reverse the sign (direction) of vector field (VF). + grid_num: The number of grids on each dimension, only used when the EqNum is 2 and x_ini is None. + x_ini: The user provided initial points that is used to find the fixed points + + Returns: + stable: A matrix consists of the coordinates of the stable steady state + saddle: A matrix consists of the coordinates of the unstable steady state """ import numdifftools as nda @@ -232,26 +243,23 @@ def gen_fixed_points( return stable, saddle -def gen_gradient(dim, N, Function, DiffusionMatrix): +def gen_gradient( + dim: int, + N: int, + Function: Callable, + DiffusionMatrix: Callable, +) -> Tuple[np.ndarray, np.ndarray]: """Calculate the gradient of the (learned) vector field function for the least action path (LAP) symbolically - Arguments - --------- - dim: 'int' - The number of dimension of the system - N: 'int' - The number of the points on the discretized path of the LAP - Function: 'function' - The function of the (learned) vector field function that is needed to calculate the Jacobian matrix - DiffusionMatrix: Python function - The function that returns the diffusion matrix which can be variable (for example, gene) dependent + Args: + dim: The number of dimension of the system. + N: The number of the points on the discretized path of the LAP. + Function: The function of the (learned) vector field function that is needed to calculate the Jacobian matrix. + DiffusionMatrix: The function that returns the diffusion matrix which can be variable (e.g. gene) dependent - Returns - ------- - ret: 'np.ndarray' - The symbolic function that calculates the gradient of the LAP based on the Jacobian of the vector field function - V: 'np.ndarray' - A matrix consists of the coordinates of the unstable steady state + Returns: + ret: The symbolic function that calculates the gradient of the LAP based on the Jacobian of the vector field function. + V: A matrix consists of the coordinates of the unstable steady state. """ from StringFunction import StringFunction @@ -307,24 +315,22 @@ def graident(dt, x): ################################################## -def IntGrad(points, Function, DiffusionMatrix, dt): +def IntGrad( + points: np.ndarray, + Function: Callable, + DiffusionMatrix: Callable, + dt: float, +) -> np.ndarray: """Calculate the action of the path based on the (reconstructed) vector field function and diffusion matrix (Eq. 18) - Arguments - --------- - points: :class:`~numpy.ndarray` - The sampled points in the state space used to calculate the action. - Function: function - The (learned) vector field function. - DiffusionMatrix: function - The function that returns diffusion matrix which can be dependent on the variables (for example, genes) - dt: 'float' - The time interval used in calculating action + Arg: + points: The sampled points in the state space used to calculate the action. + Function: The (learned) vector field function. + DiffusionMatrix: The function that returns diffusion matrix which can be dependent on the variables (for example, genes). + dt: The time interval used in calculating action. - Returns - ------- - integral: 'np.ndarray' - The action calculated based on the input path, the vector field function and the diffusion matrix. + Returns: + integral: The action calculated based on the input path, the vector field function and the diffusion matrix. """ integral = 0 @@ -338,18 +344,14 @@ def IntGrad(points, Function, DiffusionMatrix, dt): return integral[0, 0] -def DiffusionMatrix(x): +def DiffusionMatrix(x: np.ndarray) -> np.ndarray: """Diffusion matrix can be variable dependent - Arguments - --------- - x: :class:`~numpy.ndarray` - The matrix of sampled points (cells) in the (gene expression) state space. A + Args: + x: The matrix of sampled points (cells) in the (gene expression) state space. A - Returns - ------- - out: 'np.ndarray' - The diffusion matrix. By default, it is a diagonal matrix. + Returns: + out: The diffusion matrix. By default, it is a diagonal matrix. """ out = np.zeros((x.shape[0], x.shape[0])) np.fill_diagonal(out, 1) @@ -357,33 +359,30 @@ def DiffusionMatrix(x): return out -def action(n_points, tmax, point_start, point_end, boundary, Function, DiffusionMatrix): +def action( + n_points: int, + tmax: int, + point_start: np.ndarray, + point_end: np.ndarray, + boundary: np.ndarray, + Function: Callable, + DiffusionMatrix: Callable, +) -> Tuple[np.ndarray, np.ndarray]: """It calculates the minimized action value given an initial path, ODE, and diffusion matrix. The minimization is realized by scipy.optimize.Bounds function in python (without using the gradient of the action function). - Arguments - --------- - n_points: 'int' - The number of points along the least action path. - tmax: 'int' - The value at maximum t. - point_start: :class:`~numpy.ndarray` - The matrix for storing the coordinates (gene expression configuration) of the start point (initial cell state). - point_end: :class:`~numpy.ndarray` - The matrix for storing the coordinates (gene expression configuration) of the end point (terminal cell state). - boundary: :class:`~numpy.ndarray` - Not used. - Function: function - The (reconstructed) vector field function. - DiffusionMatrix: function - The function that returns the diffusion matrix which can variable (for example, gene) dependent. - - Returns - ------- - fval: :class:`~numpy.ndarray` - The action value for the learned least action path. - output_path: :class:`~numpy.ndarray` - The least action path learned + Args: + n_points: The number of points along the least action path. + tmax: The value at maximum t. + point_start: The matrix for storing the coordinates (gene expression configuration) of the start point (initial cell state). + point_end: The matrix for storing the coordinates (gene expression configuration) of the end point (terminal cell state). + boundary: Not used. + Function: The (reconstructed) vector field function. + DiffusionMatrix: The function that returns the diffusion matrix which can variable (for example, gene) dependent. + + Returns: + fval: The action value for the learned least action path. + output_path: The least action path learned. """ dim = point_end.shape[0] # genes x cells @@ -419,7 +418,12 @@ def action(n_points, tmax, point_start, point_end, boundary, Function, Diffusion return fval, output_path -def Potential(adata, DiffMat=None, method="Ao", **kwargs): +def Potential( + adata: AnnData, + DiffMat: Optional[Callable] = None, + method: str = "Ao", + **kwargs +) -> AnnData: """Function to map out the pseudo-potential landscape. Although it is appealing to define “potential” for biological systems as it is intuitive and familiar from other @@ -437,15 +441,13 @@ def Potential(adata, DiffMat=None, method="Ao", **kwargs): Parameters ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains embedding and velocity data - method: `str` (default: `Ao`) - Method to map the potential landscape. + adata: AnnData object that contains embedding and velocity data. + DiffMat: The function which returns the diffusion matrix which can variable (for example, gene) dependent. + method: Method to map the potential landscape. Returns ------- - adata: :class:`~anndata.AnnData` - `AnnData` object that is updated with the `Pot` dictionary in the `uns` attribute. + adata: `AnnData` object that is updated with the `Pot` dictionary in the `uns` attribute. """ @@ -460,41 +462,35 @@ def Potential(adata, DiffMat=None, method="Ao", **kwargs): class Pot: def __init__( self, - Function=None, - DiffMat=None, - boundary=None, - n_points=25, - fixed_point_only=False, - find_fixed_points=False, - refpoint=None, - stable=None, - saddle=None, + Function: Callable = None, + DiffMat: Callable = None, + boundary: List = None, + n_points: int = 25, + fixed_point_only: bool = False, + find_fixed_points: bool = False, + refpoint: Optional[np.ndarray] = None, + stable: Optional[np.ndarray] = None, + saddle: Optional[np.ndarray] = None, ): """It implements the least action method to calculate the potential values of fixed points for a given SDE (stochastic differential equation) model. The function requires the vector field function and a diffusion matrix. This code is based on the MATLAB code from Ruoshi Yuan and Ying Tang. Potential landscape of high dimensional nonlinear stochastic dynamics with large noise. Y Tang, R Yuan, G Wang, X Zhu, P Ao - Scientific reports, 2017 - Arguments - --------- - Function: 'function' - The (reconstructed) vector field function. - DiffMat: 'function' - The function that returns the diffusion matrix which can variable (for example, gene) dependent. - boundary: 'list' - The range of variables (genes). - n_points: 'int' - The number of points along the least action path. - fixed_point_only: 'bool' - The logic flag to determine whether only the potential for fixed point or entire space should be mapped. - find_fixed_points: 'bool' - The logic flag to determine whether only the gen_fixed_points function should be run to identify fixed points. - refpoint: 'np.ndarray' - The reference point to define the potential. - stable: 'np.ndarray' - The matrix for storing the coordinates (gene expression configuration) of the stable fixed point (characteristic state of a particular cell type). - saddle: 'np.ndarray' - The matrix for storing the coordinates (gene expression configuration) of the unstable fixed point (characteristic state of cells prime to bifurcation). + Args: + Function: The (reconstructed) vector field function. + DiffMat: The function that returns the diffusion matrix which can variable (for example, gene) dependent. + boundary: The range of variables (genes). + n_points: The number of points along the least action path. + fixed_point_only: The logic flag to determine whether only the potential + for fixed point or entire space should be mapped. + find_fixed_points: The logic flag to determine whether only the gen_fixed_points function + should be run to identify fixed points. + refpoint: The reference point to define the potential. + stable: The matrix for storing the coordinates (gene expression configuration) + of the stable fixed point (characteristic state of a particular cell type). + saddle: The matrix for storing the coordinates (gene expression configuration) + of the unstable fixed point (characteristic state of cells prime to bifurcation). """ self.VecFld = { @@ -514,16 +510,16 @@ def __init__( def fit( self, - adata, - x_lim, - y_lim, - basis="umap", - method="Ao", - xyGridSpacing=2, - dt=1e-2, - tol=1e-2, - numTimeSteps=1400, - ): + adata: AnnData, + x_lim: List, + y_lim: List, + basis: str = "umap", + method: str = "Ao", + xyGridSpacing: int = 2, + dt: float = 1e-2, + tol: float= 1e-2, + numTimeSteps: int =1400, + ) -> AnnData: """Function to map out the pseudo-potential landscape. Although it is appealing to define “potential” for biological systems as it is intuitive and familiar from other @@ -539,36 +535,29 @@ def fit( This function implements the Ao, Bhattacharya method and Ying method and will also support other methods shortly. - Arguments - --------- - adata: :class:`~anndata.AnnData` - AnnData object that contains U_grid and V_grid data - x_lim: `list` - Lower or upper limit of x-axis. - y_lim: `list` - Lower or upper limit of y-axis - basis: `str` (default: umap) - The dimension reduction method to use. - method: 'string' (default: Bhattacharya) - Method used to map the pseudo-potential landscape. By default, it is Bhattacharya (A deterministic map of + Args: + adata: AnnData object that contains U_grid and V_grid data. + x_lim: Lower or upper limit of x-axis. + y_lim: Lower or upper limit of y-axis. + basis: The dimension reduction method to use. + method: Method used to map the pseudo-potential landscape. By default, it is Bhattacharya (A deterministic map of Waddington’s epigenetic landscape for cell fate specification. Sudin Bhattacharya, Qiang Zhang and Melvin E. Andersen). Other methods will be supported include: Tang (), Ping (), Wang (), Zhou (). - - Returns - ------- - if Bhattacharya is used: - Xgrid: 'np.ndarray' - The X grid to visualize "potential surface" - Ygrid: 'np.ndarray' - The Y grid to visualize "potential surface" - Zgrid: 'np.ndarray' - The interpolate potential corresponding to the X,Y grids. - - if Tang method is used: - retmat: 'np.ndarray' - The action value for the learned least action path. - LAP: 'np.ndarray' - The least action path learned + xyGridSpacing: Grid spacing for "starting points" for each "path" on the potential surface + dt: Time step for the path integral. + tol: Tolerance to test for convergence. + numTimeSteps: A high-enough number for convergence with given dt. + + Returns: + The AnnData object updated with the following values: + if Bhattacharya is used: + Xgrid: The X grid to visualize "potential surface" + Ygrid: The Y grid to visualize "potential surface" + Zgrid: The interpolate potential corresponding to the X,Y grids. + + if Tang method is used: + retmat: The action value for the learned least action path. + LAP: The least action path learned """ if method == "Ao": diff --git a/dynamo/vectorfield/scVectorField.py b/dynamo/vectorfield/scVectorField.py index 8b828888f..05e8abfdc 100755 --- a/dynamo/vectorfield/scVectorField.py +++ b/dynamo/vectorfield/scVectorField.py @@ -1,13 +1,23 @@ import functools import itertools +import sys import warnings +from abc import abstractmethod from multiprocessing.dummy import Pool as ThreadPool -from typing import Callable, Dict, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union +if sys.version_info >= (3, 8): + from typing import Literal +else: + from typing_extensions import Literal + +import matplotlib import numpy as np import numpy.matlib import scipy.sparse as sp +from anndata import AnnData from numpy import format_float_scientific as scinot +from pynndescent import NNDescent from scipy.linalg import lstsq from scipy.spatial.distance import pdist from sklearn.neighbors import NearestNeighbors @@ -32,6 +42,8 @@ Jacobian_rkhs_gaussian, Jacobian_rkhs_gaussian_parallel, Laplacian, + NormDict, + VecFldDict, compute_acceleration, compute_curl, compute_curvature, @@ -51,8 +63,7 @@ def norm( X: np.ndarray, V: np.ndarray, T: Optional[np.ndarray] = None, fix_velocity: bool = True ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[str, np.ndarray]]: - """ - Normalizes the X, Y (X + V) matrix to have zero means and unit covariance. + """Normalizes the X, Y (X + V) matrix to have zero means and unit covariance. We use the mean of X, Y's center (mean) and scale parameters (standard deviation) to normalize T. Args: @@ -64,9 +75,8 @@ def norm( fix_velocity: Whether to fix velocity and don't transform it. Default is True. Returns: - Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, Dict[str, numpy.ndarray]]: - updated X, V, T and norm_dict which includes the mean and scale values for original X, V data used - in normalization. + A tuple of updated X, V, T and norm_dict which includes the mean and scale values for original X, V data used + in normalization. """ Y = X + V @@ -85,12 +95,21 @@ def norm( return X, V, T, norm_dict -def bandwidth_rule_of_thumb(X, return_sigma=False): +def bandwidth_rule_of_thumb(X: np.ndarray, return_sigma: Optional[bool] = False) -> Union[Tuple[float, float], float]: """ This function computes a rule-of-thumb bandwidth for a Gaussian kernel based on: https://en.wikipedia.org/wiki/Kernel_density_estimation#A_rule-of-thumb_bandwidth_estimator + + The bandwidth is a free parameter that controls the level of smoothing in the estimated distribution. + + Args: + X: Current state. This corresponds to, for example, the spliced transcriptomic state. + return_sigma: Determines whether the standard deviation will be returned in addition to the bandwidth parameter + + Returns: + Either a tuple with the bandwith and standard deviation or just the bandwidth """ - sig = sig = np.sqrt(np.mean(np.diag(np.cov(X.T)))) + sig = np.sqrt(np.mean(np.diag(np.cov(X.T)))) h = 1.06 * sig / (len(X) ** (-1 / 5)) if return_sigma: return h, sig @@ -98,7 +117,7 @@ def bandwidth_rule_of_thumb(X, return_sigma=False): return h -def bandwidth_selector(X): +def bandwidth_selector(X: np.ndarray) -> float: """ This function computes an empirical bandwidth for a Gaussian kernel. """ @@ -122,7 +141,6 @@ def bandwidth_selector(X): d = np.mean(distances[:, 1:]) / 1.5 return np.sqrt(2) * d - def denorm( VecFld: Dict[str, Union[np.ndarray, None]], X_old: np.ndarray, @@ -139,8 +157,7 @@ def denorm( data. Returns: - return VecFld_denorm: An updated VecFld dictionary that includes denormalized X, Y, X_ctrl, grid, grid_V, V, - and the norm_dict key. + An updated VecFld dictionary that includes denormalized X, Y, X_ctrl, grid, grid_V, V, and the norm_dict key. """ Y_old = X_old + V_old xm, ym = norm_dict["xm"], norm_dict["ym"] @@ -183,32 +200,22 @@ def lstsq_solver(lhs, rhs, method="drouin"): return C -def get_P(Y, V, sigma2, gamma, a, div_cur_free_kernels=False): +def get_P( + Y: np.ndarray, V: np.ndarray, sigma2: float, gamma: float, a: float, div_cur_free_kernels: Optional[bool] = False +) -> Tuple[np.ndarray, np.ndarray]: """GET_P estimates the posterior probability and part of the energy. - Arguments - --------- - Y: 'np.ndarray' - Velocities from the data. - V: 'np.ndarray' - The estimated velocity: V=f(X), f being the vector field function. - sigma2: 'float' - sigma2 is defined as sum(sum((Y - V)**2)) / (N * D) - gamma: 'float' - Percentage of inliers in the samples. This is an inital value for EM iteration, and it is not important. - a: 'float' - Paramerter of the model of outliers. We assume the outliers obey uniform distribution, and the volume of + Args: + Y: Velocities from the data. + V: The estimated velocity: V=f(X), f being the vector field function. + sigma2: sigma2 is defined as sum(sum((Y - V)**2)) / (N * D) + gamma: Percentage of inliers in the samples. This is an inital value for EM iteration, and it is not important. + a: Parameter of the model of outliers. We assume the outliers obey uniform distribution, and the volume of outlier's variation space is a. - div_cur_free_kernels: `bool` (default: False) - A logic flag to determine whether the divergence-free or curl-free kernels will be used for learning the - vector field. + div_cur_free_kernels: A logic flag to determine whether the divergence-free or curl-free kernels will be used for learning the vector field. - Returns - ------- - P: 'np.ndarray' - Posterior probability, related to equation 27. - E: `np.ndarray' - Energy, related to equation 26. + Returns: + Tuple of (posterior probability, energy) related to equations 27 and 26 in the SparseVFC paper. """ @@ -228,22 +235,20 @@ def get_P(Y, V, sigma2, gamma, a, div_cur_free_kernels=False): @timeit def graphize_vecfld( - func, - X, + func: Callable, + X: np.ndarray, nbrs_idx=None, dist=None, - k=30, - distance_free=True, - n_int_steps=20, - cores=1, -): + k: int = 30, + distance_free: bool = True, + n_int_steps: int = 20, + cores: int = 1, +) -> Tuple[np.ndarray, Union[NNDescent, NearestNeighbors]]: n, d = X.shape nbrs = None if nbrs_idx is None: if X.shape[0] > 200000 and X.shape[1] > 2: - from pynndescent import NNDescent - nbrs = NNDescent( X, metric="euclidean", @@ -334,64 +339,44 @@ def SparseVFC( velocity_based_sampling: bool = True, sigma: float = 0.8, eta: float = 0.5, - seed=0, + seed: Union[int, np.ndarray] = 0, lstsq_method: str = "drouin", verbose: int = 1, -) -> dict: +) -> VecFldDict: """Apply sparseVFC (vector field consensus) algorithm to learn a functional form of the vector field from random samples with outlier on the entire space robustly and efficiently. (Ma, Jiayi, etc. al, Pattern Recognition, 2013) - Arguments - --------- - X: 'np.ndarray' - Current state. This corresponds to, for example, the spliced transcriptomic state. - Y: 'np.ndarray' - Velocity estimates in delta t. This corresponds to, for example, the inferred spliced transcriptomic + Args: + X: Current state. This corresponds to, for example, the spliced transcriptomic state. + Y: Velocity estimates in delta t. This corresponds to, for example, the inferred spliced transcriptomic velocity or total RNA velocity based on metabolic labeling data estimated calculated by dynamo. - Grid: 'np.ndarray' - Current state on a grid which is often used to visualize the vector field. This corresponds to, for example, + Grid: Current state on a grid which is often used to visualize the vector field. This corresponds to, for example, the spliced transcriptomic state or total RNA state. - M: 'int' (default: 100) - The number of basis functions to approximate the vector field. - a: 'float' (default: 10) - Parameter of the model of outliers. We assume the outliers obey uniform distribution, and the volume of + M: The number of basis functions to approximate the vector field. + a: Parameter of the model of outliers. We assume the outliers obey uniform distribution, and the volume of outlier's variation space is `a`. - beta: 'float' (default: 0.1) - Parameter of Gaussian Kernel, k(x, y) = exp(-beta*||x-y||^2). + beta: Parameter of Gaussian Kernel, k(x, y) = exp(-beta*||x-y||^2). If None, a rule-of-thumb bandwidth will be computed automatically. - ecr: 'float' (default: 1e-5) - The minimum limitation of energy change rate in the iteration process. - gamma: 'float' (default: 0.9) - Percentage of inliers in the samples. This is an initial value for EM iteration, and it is not important. - lambda_: 'float' (default: 3) - Represents the trade-off between the goodness of data fit and regularization. Larger Lambda_ put more + ecr: The minimum limitation of energy change rate in the iteration process. + gamma: Percentage of inliers in the samples. This is an initial value for EM iteration, and it is not important. + lambda_: Represents the trade-off between the goodness of data fit and regularization. Larger Lambda_ put more weights on regularization. - minP: 'float' (default: 1e-5) - The posterior probability Matrix P may be singular for matrix inversion. We set the minimum value of P as + minP: The posterior probability Matrix P may be singular for matrix inversion. We set the minimum value of P as minP. - MaxIter: 'int' (default: 500) - Maximum iteration times. - theta: 'float' (default: 0.75) - Define how could be an inlier. If the posterior probability of a sample is an inlier is larger than theta, + MaxIter: Maximum iteration times. + theta: Define how could be an inlier. If the posterior probability of a sample is an inlier is larger than theta, then it is regarded as an inlier. - div_cur_free_kernels: `bool` (default: False) - A logic flag to determine whether the divergence-free or curl-free kernels will be used for learning the + div_cur_free_kernels: A logic flag to determine whether the divergence-free or curl-free kernels will be used for learning the vector field. - sigma: 'int' (default: `0.8`) - Bandwidth parameter. - eta: 'int' (default: `0.5`) - Combination coefficient for the divergence-free or the curl-free kernels. - seed : int or 1-d array_like, optional (default: `0`) + sigma: Bandwidth parameter. + eta: Combination coefficient for the divergence-free or the curl-free kernels. + seed: int or 1-d array_like, optional (default: `0`) Seed for RandomState. Must be convertible to 32 bit unsigned integers. Used in sampling control points. Default is to be 0 for ensure consistency between different runs. - lstsq_method: 'str' (default: `drouin`) - The name of the linear least square solver, can be either 'scipy` or `douin`. - verbose: `int` (default: `1`) - The level of printing running information. - - Returns - ------- - VecFld: 'dict' + lstsq_method: The name of the linear least square solver, can be either 'scipy` or `douin`. + verbose: The level of printing running information. + + Returns: A dictionary which contains: X: Current state. valid_ind: The indices of cells that have finite velocity values. @@ -407,7 +392,7 @@ def SparseVFC( grid: Grid of current state. grid_V: Prediction of velocity of the grid. iteration: Number of the last iteration. - tecr_vec: Vector of relative energy changes rate comparing to previous step. + tecr_traj: Vector of relative energy changes rate comparing to previous step. E_traj: Vector of energy at each iteration, where V = f(X), P is the posterior probability and VFCIndex is the indexes of inliers found by sparseVFC. Note that V = `con_K(Grid, X_ctrl, beta).dot(C)` gives the prediction of velocity on Grid (but can also be any @@ -570,11 +555,20 @@ def SparseVFC( class BaseVectorField: + """The BaseVectorField class is a base class for storing and manipulating vector fields. A vector field is a function that associates a vector to each point in a certain space. + + The BaseVectorField class has a number of methods that allow you to work with vector fields. The __init__ method initializes the object, taking in a number of optional arguments such as X, V, and Grid, which correspond to the coordinates of the points in the vector field, the vector values at those points, and a grid used for evaluating the vector field, respectively. + + The construct_graph method takes in a set of coordinates X and returns a tuple consisting of a matrix of pairwise distances between the points in X and an object for performing nearest neighbor searches. The from_adata method takes in an AnnData object and a basis string, and extracts the coordinates and vector values of the vector field stored in the AnnData object. + + The get_X, get_V, and get_data methods return the coordinates, vector values, and both the coordinates and vector values of the vector field, respectively. The find_fixed_points method searches for fixed points of the vector field function, which are points where the velocity of the vector field is zero. The get_fixed_points method returns the fixed points and their types (stable or unstable). The plot method generates a plot of the vector field. + """ + def __init__( self, - X=None, - V=None, - Grid=None, + X: Optional[np.ndarray] = None, + V: Optional[np.ndarray] = None, + Grid: Optional[np.ndarray] = None, *args, **kwargs, ): @@ -584,36 +578,52 @@ def __init__( self.fixed_points = kwargs.pop("fixed_points", None) super().__init__(**kwargs) - def construct_graph(self, X=None, **kwargs): + def construct_graph( + self, + X: Optional[np.ndarray] = None, + **kwargs, + ) -> Tuple[np.ndarray, Union[NNDescent, NearestNeighbors]]: X = self.data["X"] if X is None else X return graphize_vecfld(self.func, X, **kwargs) - def from_adata(self, adata, basis="", vf_key="VecFld"): + def from_adata(self, adata: AnnData, basis: str = "", vf_key: str = "VecFld"): vf_dict, func = vecfld_from_adata(adata, basis=basis, vf_key=vf_key) self.data["X"] = vf_dict["X"] self.data["V"] = vf_dict["Y"] # use the raw velocity self.vf_dict = vf_dict self.func = func - def get_X(self, idx=None): + def get_X(self, idx: Optional[int] = None) -> np.ndarray: if idx is None: return self.data["X"] else: return self.data["X"][idx] - def get_V(self, idx=None): + def get_V(self, idx: Optional[int] = None) -> np.ndarray: if idx is None: return self.data["V"] else: return self.data["V"][idx] - def get_data(self): + def get_data(self) -> Tuple[np.ndarray, np.ndarray]: return self.data["X"], self.data["V"] - def find_fixed_points(self, n_x0=100, X0=None, domain=None, sampling_method="random", **kwargs): + def find_fixed_points( + self, + n_x0: int = 100, + X0: Optional[np.ndarray] = None, + domain=None, + sampling_method=Literal["random", "velocity", "trn", "kmeans"], + **kwargs, + ): """ Search for fixed points of the vector field function. + Args: + n_x0: Number of sampling points + X0: An array of shape (n_samples, n_dim) + domain: Domain in which to search for fixed points + sampling_method: """ if domain is not None: domain = np.atleast_2d(domain) @@ -638,19 +648,17 @@ def find_fixed_points(self, n_x0=100, X0=None, domain=None, sampling_method="ran X, J, _ = find_fixed_points(X0, self.func, domain=domain, **kwargs) self.fixed_points = FixedPoints(X, J) - def get_fixed_points(self, **kwargs): + def get_fixed_points(self, **kwargs) -> Tuple[np.ndarray, np.ndarray]: """ Get fixed points of the vector field function. - Returns - ------- - Xss: :class:`~numpy.ndarray` - Coordinates of the fixed points. - ftype: :class:`~numpy.ndarray` - Types of the fixed points: - -1 -- stable, - 0 -- saddle, - 1 -- unstable + Returns: + Tuple storing the coordinates of the fixed points and the types of the fixed points. + + Types of the fixed points: + -1 -- stable, + 0 -- saddle, + 1 -- unstable """ if self.fixed_points is None: self.find_fixed_points(**kwargs) @@ -659,8 +667,18 @@ def get_fixed_points(self, **kwargs): ftype = self.fixed_points.get_fixed_point_types() return Xss, ftype - def assign_fixed_points(self, domain=None, cores=1, **kwargs): - """assign each cell to the associated fixed points""" + def assign_fixed_points( + self, domain: Optional[np.ndarray] = None, cores: int = 1, **kwargs + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """assign each cell to the associated fixed points + + Args: + domain: Array of shape (n_dim, 2), which stores the domain for each dimension + cores: Defaults to 1. + + Returns: + Tuple of fixed points, type assignments, and assignment IDs + """ if domain is None and self.data is not None: domain = np.vstack((np.min(self.data["X"], axis=0), np.max(self.data["X"], axis=0))).T @@ -714,8 +732,8 @@ def assign_fixed_points(self, domain=None, cores=1, **kwargs): def integrate( self, - init_states, - dims=None, + init_states: np.ndarray, + dims: Optional[Union[int, list, np.ndarray]] = None, scale=1, t_end=None, step_size=None, @@ -726,7 +744,26 @@ def integrate( sampling="arc_length", verbose=False, disable=False, - ): + ) -> Tuple[np.ndarray, np.ndarray]: + """Integrate along a path through the vector field field function to predict the state after a certain amount of time t has elapsed. + + Args: + init_states: Initial state provided to scipy's ivp_solver with shape (num_cells, num_dim) + dims: Dimensions of state to be used + scale: Scale the vector field function by this factor. Defaults to 1. + t_end: Integrates up till when t=t_end, Defaults to None. + step_size: Defaults to None. + args: Additional arguments provided to scipy's ivp_solver Defaults to (). + integration_direction: Defaults to "forward". + interpolation_num: Defaults to 250. + average: Defaults to True. + sampling: Defaults to "arc_length". + verbose: Defaults to False. + disable: Defaults to False. + + Returns: + Tuple storing times and predictions + """ from ..prediction.utils import integrate_vf_ivp from ..tools.utils import getTend, getTseq @@ -760,16 +797,45 @@ def integrate( class DifferentiableVectorField(BaseVectorField): + @abstractmethod def get_Jacobian(self, method=None): - # subclasses must implement this function. - pass + raise NotImplementedError + + def compute_divergence(self, X: Optional[np.ndarray] = None, method: str = "analytical", **kwargs) -> np.ndarray: + """Takes the trace of the jacobian matrix to calculate the divergence. + + Args: + X: Current state. Defaults to None, initialized from self.data + method: Method for calculating the Jacobian, one of numerical, analytical, parallel - def compute_divergence(self, X=None, method="analytical", **kwargs): + Returns: + The divergence of the Jacobian matrix. + """ X = self.data["X"] if X is None else X f_jac = self.get_Jacobian(method=method) return compute_divergence(f_jac, X, **kwargs) - def compute_curl(self, X=None, method="analytical", dim1=0, dim2=1, dim3=2, **kwargs): + def compute_curl( + self, + X: Optional[np.ndarray] = None, + method: str = "analytical", + dim1: int = 0, + dim2: int = 1, + dim3: int = 2, + **kwargs, + ) -> np.ndarray: + """Curl computation for many samples for 2/3 D systems. + + Args: + X: Current state. Defaults to None, initialized from self.data + method: Method for calculating the Jacobian, one of numerical, analytical, parallel + dim1: index of first dimension + dim2: index of second dimension + dim3: index of third dimension + + Returns: + np.ndarray storing curl + """ X = self.data["X"] if X is None else X if dim3 is None or X.shape[1] < 3: X = X[:, [dim1, dim2]] @@ -778,75 +844,134 @@ def compute_curl(self, X=None, method="analytical", dim1=0, dim2=1, dim3=2, **kw f_jac = self.get_Jacobian(method=method, **kwargs) return compute_curl(f_jac, X, **kwargs) - def compute_acceleration(self, X=None, method="analytical", **kwargs): + def compute_acceleration(self, X: Optional[np.ndarray] = None, method: str = "analytical", **kwargs) -> np.ndarray: + """Calculate acceleration for many samples via + + .. math:: + a = || J \cdot v ||. + + Args: + X: Current state. Defaults to None, initialized from self.data + method: Method for calculating the Jacobian, one of numerical, analytical, parallel + + Returns: + np.ndarray storing the vector norm of acceleration (across all genes) for each cell + """ X = self.data["X"] if X is None else X f_jac = self.get_Jacobian(method=method) return compute_acceleration(self.func, f_jac, X, **kwargs) - def compute_curvature(self, X=None, method="analytical", formula=2, **kwargs): + def compute_curvature( + self, X: Optional[np.ndarray] = None, method: str = "analytical", formula: int = 2, **kwargs + ) -> np.ndarray: + """Calculate curvature for many samples via + + Formula 1: + .. math:: + \kappa = \frac{||\mathbf{v} \times \mathbf{a}||}{||\mathbf{V}||^3} + + Formula 2: + .. math:: + \kappa = \frac{||\mathbf{Jv} (\mathbf{v} \cdot \mathbf{v}) - ||\mathbf{v} (\mathbf{v} \cdot \mathbf{Jv})}{||\mathbf{V}||^4} + + Args: + X: Current state. Defaults to None, initialized from self.data + method: Method for calculating the Jacobian, one of numerical, analytical, parallel + formula: Choose between formulas 1 and 2 to compute the curvature. Defaults to 2. + + Returns: + np.ndarray storing the vector norm of curvature (across all genes) for each cell + """ X = self.data["X"] if X is None else X f_jac = self.get_Jacobian(method=method) return compute_curvature(self.func, f_jac, X, formula=formula, **kwargs) - def compute_torsion(self, X=None, method="analytical", **kwargs): + def compute_torsion(self, X: Optional[np.ndarray] = None, method: str = "analytical", **kwargs) -> np.ndarray: + """Calculate torsion for many samples via + + .. math:: + \tau = \frac{(\mathbf{v} \times \mathbf{a}) \cdot (\mathbf{J} \cdot \mathbf{a})}{||\mathbf{V} \times \mathbf{a}||^2} + + Args: + X: Current state. Defaults to None, initialized from self.data + method: Method for calculating the Jacobian, one of numerical, analytical, parallel + + Returns: + np.ndarray storing torsion for each sample + """ X = self.data["X"] if X is None else X f_jac = self.get_Jacobian(method=method) return compute_torsion(self.func, f_jac, X, **kwargs) - def compute_sensitivity(self, X=None, method="analytical", **kwargs): + def compute_sensitivity(self, X: Optional[np.ndarray] = None, method: str = "analytical", **kwargs) -> np.ndarray: + """Calculate sensitivity for many samples via + + .. math:: + S = (I - J)^{-1} D(\frac{1}{{I-J}^{-1}}) + + Args: + X: Current state. Defaults to None, initialized from self.data + method: Method for calculating the Jacobian, one of numerical, analytical, parallel Defaults to "analytical". + + Returns: + np.ndarray storing sensitivity matrix + """ X = self.data["X"] if X is None else X f_jac = self.get_Jacobian(method=method) return compute_sensitivity(f_jac, X, **kwargs) class SvcVectorField(DifferentiableVectorField): - def __init__(self, X=None, V=None, Grid=None, *args, **kwargs): + def __init__( + self, + X: Optional[np.ndarray] = None, + V: Optional[np.ndarray] = None, + Grid: Optional[np.ndarray] = None, + *args, + **kwargs, + ): """Initialize the VectorField class. - Parameters - ---------- - X: :class:`~numpy.ndarray` (dimension: n_obs x n_features) - Original data. - V: :class:`~numpy.ndarray` (dimension: n_obs x n_features) - Velocities of cells in the same order and dimension of X. - Grid: :class:`~numpy.ndarray` - The function that returns diffusion matrix which can be dependent on the variables (for example, genes) - M: `int` (default: None) - The number of basis functions to approximate the vector field. By default it is calculated as - `min(len(X), int(1500 * np.log(len(X)) / (np.log(len(X)) + np.log(100))))`. So that any datasets with less - than about 900 data points (cells) will use full data for vector field reconstruction while any dataset - larger than that will at most use 1500 data points. - a: `float` (default 5) - Parameter of the model of outliers. We assume the outliers obey uniform distribution, and the volume of - outlier's variation space is a. - beta: `float` (default: None) - Parameter of Gaussian Kernel, k(x, y) = exp(-beta*||x-y||^2). - If None, a rule-of-thumb bandwidth will be computed automatically. - ecr: `float` (default: 1e-5) - The minimum limitation of energy change rate in the iteration process. - gamma: `float` (default: 0.9) - Percentage of inliers in the samples. This is an inital value for EM iteration, and it is not important. - Default value is 0.9. - lambda_: `float` (default: 3) - Represents the trade-off between the goodness of data fit and regularization. - minP: `float` (default: 1e-5) - The posterior probability Matrix P may be singular for matrix inversion. We set the minimum value of P as - minP. - MaxIter: `int` (default: 500) - Maximum iteration times. - theta: `float` (default 0.75) - Define how could be an inlier. If the posterior probability of a sample is an inlier is larger than theta, - then it is regarded as an inlier. - div_cur_free_kernels: `bool` (default: False) - A logic flag to determine whether the divergence-free or curl-free kernels will be used for learning the - vector field. - sigma: `int` - Bandwidth parameter. - eta: `int` - Combination coefficient for the divergence-free or the curl-free kernels. - seed : int or 1-d array_like, optional (default: `0`) - Seed for RandomState. Must be convertible to 32 bit unsigned integers. Used in sampling control points. - Default is to be 0 for ensure consistency between different runs. + Args: + X: (dimension: n_obs x n_features), Original data. + V: (dimension: n_obs x n_features), Velocities of cells in the same order and dimension of X. + Grid: The function that returns diffusion matrix which can be dependent on the variables (for example, genes) + M: `int` (default: None) + The number of basis functions to approximate the vector field. By default it is calculated as + `min(len(X), int(1500 * np.log(len(X)) / (np.log(len(X)) + np.log(100))))`. So that any datasets with less + than about 900 data points (cells) will use full data for vector field reconstruction while any dataset + larger than that will at most use 1500 data points. + a: `float` (default 5) + Parameter of the model of outliers. We assume the outliers obey uniform distribution, and the volume of + outlier's variation space is a. + beta: `float` (default: None) + Parameter of Gaussian Kernel, k(x, y) = exp(-beta*||x-y||^2). + If None, a rule-of-thumb bandwidth will be computed automatically. + ecr: `float` (default: 1e-5) + The minimum limitation of energy change rate in the iteration process. + gamma: `float` (default: 0.9) + Percentage of inliers in the samples. This is an inital value for EM iteration, and it is not important. + Default value is 0.9. + lambda_: `float` (default: 3) + Represents the trade-off between the goodness of data fit and regularization. + minP: `float` (default: 1e-5) + The posterior probability Matrix P may be singular for matrix inversion. We set the minimum value of P as + minP. + MaxIter: `int` (default: 500) + Maximum iteration times. + theta: `float` (default 0.75) + Define how could be an inlier. If the posterior probability of a sample is an inlier is larger than theta, + then it is regarded as an inlier. + div_cur_free_kernels: `bool` (default: False) + A logic flag to determine whether the divergence-free or curl-free kernels will be used for learning the + vector field. + sigma: `int` + Bandwidth parameter. + eta: `int` + Combination coefficient for the divergence-free or the curl-free kernels. + seed : int or 1-d array_like, optional (default: `0`) + Seed for RandomState. Must be convertible to 32 bit unsigned integers. Used in sampling control points. + Default is to be 0 for ensure consistency between different runs. """ super().__init__(X, V, Grid) @@ -875,27 +1000,20 @@ def __init__(self, X=None, V=None, Grid=None, *args, **kwargs): self.norm_dict = {} - def train(self, normalize=False, **kwargs): + def train(self, normalize: bool = False, **kwargs) -> VecFldDict: """Learn an function of vector field from sparse single cell samples in the entire space robustly. Reference: Regularized vector field learning with sparse approximation for mismatch removal, Ma, Jiayi, etc. al, Pattern Recognition - Arguments - --------- - normalize: 'bool' (default: False) - Logic flag to determine whether to normalize the data to have zero means and unit covariance. This is + Args: + normalize: Logic flag to determine whether to normalize the data to have zero means and unit covariance. This is often required for raw dataset (for example, raw UMI counts and RNA velocity values in high dimension). But it is normally not required for low dimensional embeddings by PCA or other non-linear dimension reduction methods. - method: 'string' - Method that is used to reconstruct the vector field functionally. Currently only SparseVFC supported but - other improved approaches are under development. - - Returns - ------- - VecFld: `dict' - A dictionary which contains X, Y, beta, V, C, P, VFCIndex. Where V = f(X), P is the posterior - probability and VFCIndex is the indexes of inliers which found by VFC. + + Returns: + A dictionary which contains X, Y, beta, V, C, P, VFCIndex. Where V = f(X), P is the posterior + probability and VFCIndex is the indexes of inliers which found by VFC. """ if normalize: @@ -933,12 +1051,14 @@ def train(self, normalize=False, **kwargs): return self.vf_dict - def plot_energy(self, figsize=None, fig=None): + def plot_energy( + self, figsize: Optional[Tuple[float, float]] = None, fig: Optional[matplotlib.figure.Figure] = None + ): from ..plot.scVectorField import plot_energy plot_energy(None, vecfld_dict=self.vf_dict, figsize=figsize, fig=fig) - def get_Jacobian(self, method="analytical", input_vector_convention="row", **kwargs): + def get_Jacobian(self, method: str = "analytical", input_vector_convention: str = "row", **kwargs) -> np.ndarray: """ Get the Jacobian of the vector field function. If method is 'analytical': @@ -973,7 +1093,7 @@ def get_Jacobian(self, method="analytical", input_vector_convention="row", **kwa f"supports 'analytical', 'numerical', and 'parallel'." ) - def get_Hessian(self, method="analytical", **kwargs): + def get_Hessian(self, method: str = "analytical", **kwargs) -> np.ndarray: """ Get the Hessian of the vector field function. If method is 'analytical': @@ -997,7 +1117,7 @@ def get_Hessian(self, method="analytical", **kwargs): else: raise NotImplementedError(f"The method {method} is not implemented. Currently only supports 'analytical'.") - def get_Laplacian(self, method="analytical", **kwargs): + def get_Laplacian(self, method: str = "analytical", **kwargs) -> np.ndarray: """ Get the Laplacian of the vector field. Laplacian is defined as the sum of the diagonal of the Hessian matrix. Because Hessian is originally defined for scalar function and here we extend it to vector functions. We will @@ -1017,22 +1137,16 @@ def get_Laplacian(self, method="analytical", **kwargs): else: raise NotImplementedError(f"The method {method} is not implemented. Currently only supports 'analytical'.") - def evaluate(self, CorrectIndex, VFCIndex, siz): + def evaluate(self, CorrectIndex: List, VFCIndex: List, siz: int) -> Tuple[float, float, float]: """Evaluate the precision, recall, corrRate of the sparseVFC algorithm. - Arguments - --------- - CorrectIndex: 'List' - Ground truth indexes of the correct vector field samples. - VFCIndex: 'List' - Indexes of the correct vector field samples learned by VFC. - siz: 'int' - Number of initial matches. + Args: + CorrectIndex: Ground truth indexes of the correct vector field samples. + VFCIndex: Indexes of the correct vector field samples learned by VFC. + siz: Number of initial matches. - Returns - ------- - A tuple of precision, recall, corrRate: - Precision, recall, corrRate: Precision and recall of VFC, percentage of initial correct matches. + Returns: + A tuple of precision, recall, corrRate, where Precision, recall, corrRate are Precision and recall of VFC, percentage of initial correct matches, respectively. See also:: :func:`sparseVFC`. """ @@ -1058,8 +1172,30 @@ def evaluate(self, CorrectIndex, VFCIndex, siz): class KOVectorField(DifferentiableVectorField): def __init__( - self, X=None, V=None, Grid=None, K=None, func_base=None, fjac_base=None, PCs=None, mean=None, *args, **kwargs + self, + X: Optional[np.ndarray] = None, + V: Optional[np.ndarray] = None, + Grid=None, + K=None, + func_base: Optional[Callable] = None, + fjac_base: Optional[Callable] = None, + PCs: Optional[np.ndarray] = None, + mean: float = None, + *args, + **kwargs, ): + """_summary_ + + Args: + X: (dimension: n_obs x n_features), Original data. Defaults to None. + V: (dimension: n_obs x n_features), Velocities of cells in the same order and dimension of X. Defaults to None. + Grid: Grid of the current state. Defaults to None. + K: _description_. Defaults to None. + func_base: _description_. Defaults to None. + fjac_base: Callable passed to `Jacobian_kovf` to generate the Jacobian. Defaults to None. + PCs: The PCA loading matrix of dimensions d x k, where d is the number of dimensions of the original space. Defaults to None. + mean: _description_. Defaults to None. + """ super().__init__(X, V, Grid=Grid, *args, **kwargs) if K.ndim == 2: @@ -1074,6 +1210,10 @@ def __init__( self.setup_perturbed_func() def setup_perturbed_func(self): + """ + Reference "In silico perturbation to predict gene-wise perturbation effects and cell fate diversions" in the methods section + """ + def vf_func_perturb(x): x_gene = np.abs(x @ self.PCs.T + self.mean) v_gene = vector_transformation(self.func_base(x), self.PCs) @@ -1251,7 +1391,15 @@ def vector_field_function_knockout( class BifurcationTwoGenesVectorField(DifferentiableVectorField): - def __init__(self, param_dict, X=None, V=None, Grid=None, *args, **kwargs): + def __init__( + self, + param_dict, + X: Optional[np.ndarray] = None, + V: Optional[np.ndarray] = None, + Grid: Optional[np.ndarray] = None, + *args, + **kwargs, + ): super().__init__(X, V, Grid, *args, **kwargs) param_dict_ = param_dict.copy() for k in param_dict_.keys(): @@ -1264,7 +1412,7 @@ def __init__(self, param_dict, X=None, V=None, Grid=None, *args, **kwargs): def get_Jacobian(self, method=None): return lambda x: jacobian_bifur2genes(x, **self.vf_dict["params"]) - def find_fixed_points(self, n_x0=10, **kwargs): + def find_fixed_points(self, n_x0: int = 10, **kwargs): a = self.vf_dict["params"]["a"] b = self.vf_dict["params"]["b"] gamma = self.vf_dict["params"]["gamma"] diff --git a/dynamo/vectorfield/stochastic_process.py b/dynamo/vectorfield/stochastic_process.py index d89a627b0..538d8e45f 100644 --- a/dynamo/vectorfield/stochastic_process.py +++ b/dynamo/vectorfield/stochastic_process.py @@ -1,55 +1,45 @@ +from typing import List, Optional + import numpy as np +from anndata import AnnData from sklearn.neighbors import NearestNeighbors from tqdm import tqdm from ..tools.connectivity import _gen_neighbor_keys, check_and_recompute_neighbors from ..tools.utils import log1p_ -from .utils import vecfld_from_adata, vector_field_function +from .utils import VecFldDict, vecfld_from_adata, vector_field_function def diffusionMatrix( - adata, - X_data=None, - V_data=None, - genes=None, - layer=None, - basis="umap", - dims=None, - n=30, - VecFld=None, - residual="vector_field", -): + adata: AnnData, + X_data: Optional[np.ndarray] = None, + V_data: Optional[np.ndarray] = None, + genes: Optional[List] = None, + layer: Optional[str] = None, + basis: str = "umap", + dims: Optional[List] = None, + n: int = 30, + VecFld: Optional[VecFldDict] = None, + residual: str = "vector_field", +) -> AnnData: """Calculate the diffusion matrix from the estimated velocity vector and the reconstructed vector field. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - an Annodata object. - X_data: `np.ndarray` (default: `None`) - The user supplied expression (embedding) data that will be used for calculating diffusion matrix directly. - V_data: `np.ndarray` (default: `None`) - The user supplied velocity data that will be used for calculating diffusion matrix directly. - genes: `list` or None (default: `None`) - The list of genes that will be used to subset the data. If `None`, all genes will be used. - layer: `str` or None (default: None) - Which layer of the data will be used for diffusion matrix calculation. - basis: `str` (default: `umap`) - Which basis of the data will be used for diffusion matrix calculation. - dims: `list` or None (default: `None`) - The list of dimensions that will be selected for diffusion matrix calculation. If `None`, all dimensions will be used. - n: `int` (default: `10`) - Number of nearest neighbors when the nearest neighbor graph is not included. - VecFld: `dictionary` or None (default: None) - The reconstructed vector field function. - residual: `str` or None (default: `vector_field`) - Method to calculate residual velocity vectors for diffusion matrix calculation. If `average`, all velocity + Args: + adata: an Annodata object. + X_data: The user supplied expression (embedding) data that will be used for calculating diffusion matrix directly. + V_data: The user supplied velocity data that will be used for calculating diffusion matrix directly. + genes: The list of genes that will be used to subset the data. If `None`, all genes will be used. + layer: Which layer of the data will be used for diffusion matrix calculation. + basis: Which basis of the data will be used for diffusion matrix calculation. + dims: The list of dimensions that will be selected for diffusion matrix calculation. If `None`, all dimensions will be used. + n: Number of nearest neighbors when the nearest neighbor graph is not included. + VecFld: The reconstructed vector field function. + residual: Method to calculate residual velocity vectors for diffusion matrix calculation. If `average`, all velocity of the nearest neighbor cells will be minused by its average velocity; if `vector_field`, all velocity will be minused by the predicted velocity from the reconstructed deterministic velocity vector field. - Returns - ------- - adata: :class:`~anndata.AnnData` - `AnnData` object that is updated with the `diffusion_matrix` key in the `uns` attribute which is a list of + Returns: + adata: `AnnData` object that is updated with the `diffusion_matrix` key in the `uns` attribute which is a list of the diffusion matrix for each cell. A column `diffusion` corresponds to the square root of the sum of all elements for each cell's diffusion matrix will also be added. """ @@ -200,18 +190,15 @@ def diffusionMatrix( adata.uns["diffusion_matrix"] = dmatrix -def diffusionMatrix2D(V_mat): +def diffusionMatrix2D(V_mat: np.ndarray) -> np.ndarray: """Function to calculate cell-specific diffusion matrix for based on velocity vectors of neighbors. This function works for two dimension. See :func:`diffusionMatrix` for generalization to arbitrary dimensions. - Parameters - ---------- - V_mat: `np.ndarray` - velocity vectors of neighbors + Args: + V_mat: velocity vectors of neighbors - Returns - ------- + Returns: Return the cell-specific diffusion matrix See also:: :func:`diffusionMatrix` diff --git a/dynamo/vectorfield/topography.py b/dynamo/vectorfield/topography.py index 30c63de03..fb28faaab 100755 --- a/dynamo/vectorfield/topography.py +++ b/dynamo/vectorfield/topography.py @@ -2,11 +2,12 @@ import datetime import os import warnings -from typing import Union +from typing import Callable, List, Optional, Tuple, Union import anndata import numpy as np import scipy.sparse as sp +from anndata import AnnData from scipy.integrate import odeint from scipy.linalg import eig from scipy.optimize import fsolve @@ -18,6 +19,7 @@ from .FixedPoints import FixedPoints from .scVectorField import BaseVectorField, SvcVectorField from .utils import ( + VecFldDict, angle, dynode_vector_field_function, find_fixed_points, @@ -26,10 +28,20 @@ vecfld_from_adata, vector_field_function, ) -from .vector_calculus import curl, divergence -def pac_onestep(x0, func, v0, ds=0.01): +def pac_onestep(x0: np.ndarray, func: Callable, v0: np.ndarray, ds: float = 0.01): + """One step of the predictor-corrector method + + Args: + x0: current value + func: function to be integrated + v0: tangent predictor + ds: step size, Defaults to 0.01. + + Returns: + x1: next value + """ x01 = x0 + v0 * ds def F(x): @@ -39,7 +51,29 @@ def F(x): return x1 -def continuation(x0, func, s_max, ds=0.01, v0=None, param_axis=0, param_direction=1): +def continuation( + x0: np.ndarray, + func: Callable, + s_max: float, + ds: float = 0.01, + v0: Optional[np.ndarray] = None, + param_axis: int = 0, + param_direction: int = 1, +) -> np.ndarray: + """Continually integrate the ODE `func` from x0 + + Args: + x0: initial value + func: function to be integrated + s_max: maximum integration length + ds: step size, Defaults to 0.01. + v0: initial tangent vector, Defaults to None. + param_axis: axis of the parameter, Defaults to 0. + param_direction: direction of the parameter, Defaults to 1. + + Returns: + np.ndarray of values along the curve + """ ret = [x0] if v0 is None: # initialize tangent predictor v = np.zeros_like(x0) @@ -59,7 +93,19 @@ def continuation(x0, func, s_max, ds=0.01, v0=None, param_axis=0, param_directio return np.array(ret) -def clip_curves(curves, domain, tol_discont=None): +def clip_curves( + curves: Union[List[List], List[np.ndarray]], domain: np.ndarray, tol_discont=None +) -> Union[List[List], List[np.ndarray]]: + """Clip curves to the domain + + Args: + curves: list of curves + domain: domain of the curves of dimension n x 2 + tol_discont: tolerance for discontinuity, Defaults to None. + + Returns: + list of clipped curves joined together + """ ret = [] for cur in curves: clip_away = np.zeros(len(cur), dtype=bool) @@ -87,7 +133,29 @@ def clip_curves(curves, domain, tol_discont=None): return ret -def compute_nullclines_2d(X0, fdx, fdy, x_range, y_range, s_max=None, ds=None): +def compute_nullclines_2d( + X0: Union[List, np.ndarray], + fdx: Callable, + fdy: Callable, + x_range: List, + y_range: List, + s_max: Optional[float] = None, + ds: Optional[float] = None, +) -> Tuple[List]: + """Compute nullclines of a 2D vector field. Nullclines are curves along which vector field is zero in either the x or y direction. + + Args: + X0: initial value + fdx: differential equation for x + fdy: differential equation for y + x_range: range of x + y_range: range of y + s_max: maximum integration length, Defaults to None. + ds: step size, Defaults to None. + + Returns: + Tuple of nullclines in x and y + """ if s_max is None: s_max = 5 * ((x_range[1] - x_range[0]) + (y_range[1] - y_range[0])) if ds is None: @@ -110,7 +178,31 @@ def compute_nullclines_2d(X0, fdx, fdy, x_range, y_range, s_max=None, ds=None): return NCx, NCy -def compute_separatrices(Xss, Js, func, x_range, y_range, t=50, n_sample=500, eps=1e-6): +def compute_separatrices( + Xss: np.ndarray, + Js: np.ndarray, + func: Callable, + x_range: List, + y_range: List, + t: int = 50, + n_sample: int = 500, + eps: float = 1e-6, +) -> List: + """Compute separatrix based on jacobians at points in `Xss` + + Args: + Xss: list of steady states + Js: list of jacobians at steady states + func: function to be integrated + x_range: range of x + y_range: range of y + t: integration time, Defaults to 50. + n_sample: number of samples, Defaults to 500. + eps: tolerance for discontinuity, Defaults to 1e-6. + + Returns: + list of separatrices + """ ret = [] for i, x in enumerate(Xss): print(x) @@ -137,7 +229,16 @@ def compute_separatrices(Xss, Js, func, x_range, y_range, t=50, n_sample=500, ep return ret -def set_test_points_on_curve(curve, interval): +def set_test_points_on_curve(curve: List[np.ndarray], interval: float) -> np.ndarray: + """Generates an np.ndarray of test points that are spaced out by `interval` distance + + Args: + curve: list of points + interval: distance for separation + + Returns: + np.ndarray of test points + """ P = [curve[0]] dist = 0 for i in range(1, len(curve)): @@ -148,7 +249,17 @@ def set_test_points_on_curve(curve, interval): return np.array(P) -def find_intersection_2d(curve1, curve2, tol_redundant=1e-4): +def find_intersection_2d(curve1: List[np.ndarray], curve2: List[np.ndarray], tol_redundant: float = 1e-4) -> np.ndarray: + """Compute intersections between curve 1 and curve2 + + Args: + curve1: list of points + curve2: list of points + tol_redundant: Defaults to 1e-4. + + Returns: + np.ndarray of intersection points between curve1 and curve2 + """ P = [] for i in range(len(curve1) - 1): for j in range(len(curve2) - 1): @@ -167,7 +278,25 @@ def find_intersection_2d(curve1, curve2, tol_redundant=1e-4): return np.array(P) -def find_fixed_points_nullcline(func, NCx, NCy, sample_interval=0.5, tol_redundant=1e-4): +def find_fixed_points_nullcline( + func: Callable, + NCx: List[List[np.ndarray]], + NCy: List[List[np.ndarray]], + sample_interval: float = 0.5, + tol_redundant: float = 1e-4, +) -> Tuple[np.ndarray, np.ndarray]: + """Find fixed points by computing the intersections of x and y nullclines using `find_intersection_2d` and passing these intersection points as samppling points to `find_fixed_points`. + + Args: + func: Callable passed to `find_fixed_points` along with the intersection points of the two nullclines + NCx: List of x nullcline + NCy: List of y nullcline + sample_interval: Interval for sampling test points along x and y nullclines. Defaults to 0.5. + tol_redundant: Defaults to 1e-4. + + Returns: + A tuple with solutions for where func(x) = 0 and the Jacobian matrix + """ test_Px = [] for i in range(len(NCx)): test_Px.append(set_test_points_on_curve(NCx[i], sample_interval)) @@ -195,7 +324,18 @@ def calc_fft(x): return xFFT[: int(n / 2)], freq -def dup_osc_idx(x, n_dom=3, tol=0.05): +def dup_osc_idx(x: np.ndarray, n_dom: int = 3, tol: float = 0.05): + """ + Find the index of the end of the first division in an array where the oscillatory patterns of two consecutive divisions are similar within a given tolerance. + + Args: + x: An array-like object containing the data to be analyzed. + n_dom: An integer specifying the number of divisions to make in the array. Defaults to 3. + tol: A float specifying the tolerance for considering the oscillatory patterns of two divisions to be similar. Defaults to 0.05. + + Returns: + A tuple containing the index of the end of the first division and the difference between the FFTs of the two divisions. If the oscillatory patterns of the two divisions are not similar within the given tolerance, returns (None, None). + """ l_int = int(np.floor(len(x) / n_dom)) ind_a, ind_b = np.arange((n_dom - 2) * l_int, (n_dom - 1) * l_int), np.arange((n_dom - 1) * l_int, n_dom * l_int) y1 = x[ind_a] @@ -223,7 +363,17 @@ def calc_fft_k(x): return idx, diff -def dup_osc_idx_iter(x, max_iter=5, **kwargs): +def dup_osc_idx_iter(x: np.ndarray, max_iter: int = 5, **kwargs) -> Tuple[int, np.ndarray]: + """ + Find the index of the end of the first division in an array where the oscillatory patterns of two consecutive divisions are similar within a given tolerance, using iterative search. + + Args: + x: An array-like object containing the data to be analyzed. + max_iter: An integer specifying the maximum number of iterations to perform. Defaults to 5. + + Returns: + A tuple containing the index of the end of the first division and an array of differences between the FFTs of consecutive divisions. If the oscillatory patterns of the two divisions are not similar within the given tolerance after the maximum number of iterations, returns the index and array from the final iteration. + """ stop = False idx = len(x) j = 0 @@ -245,7 +395,29 @@ def dup_osc_idx_iter(x, max_iter=5, **kwargs): # TODO: This should be inherited from the BaseVectorField/DifferentiatiableVectorField class, # and BifurcationTwoGenes should be inherited from this class. class VectorField2D: - def __init__(self, func, func_vx=None, func_vy=None, X_data=None): + """ + The VectorField2D class is a class that represents a 2D vector field, which is a type of mathematical object that assigns a 2D vector to each point in a 2D space. This vector field can be defined using a function that returns the vector at each point, or by separate functions for the x and y components of the vector. + + The class also has several methods for finding fixed points (points where the vector is zero) in the vector field, as well as for querying the fixed points that have been found. The `find_fixed_points_by_sampling` method uses sampling to find fixed points within a specified range in the x and y dimensions. It does this by generating a set of random or Latin Hypercube Sampled (LHS) points within the specified range, and then using the `find_fixed_points` function to find the fixed points that are closest to these points. The `find_fixed_points function` uses an iterative method to find fixed points, starting from an initial guess and using the Jacobian matrix at each point to update the guess until the fixed point is found to within a certain tolerance. + + The `get_Xss_confidence` method estimates the confidence of the fixed points by computing the mean distance of each fixed point to its nearest + neighbors in the data used to define the vector field. It returns an array of confidence values for each fixed point, with higher values indicating higher confidence. + """ + + def __init__( + self, + func: Callable, + func_vx: Optional[Callable] = None, + func_vy: Optional[Callable] = None, + X_data: Optional[np.ndarray] = None, + ): + """ + Args: + func: a function that takes an (n, 2) array of coordinates and returns an (n, 2) array of vectors + func_vx: a function that takes an (n, 2) array of coordinates and returns an (n,) array of x components of the vectors, Defaults to None. + func_vy: a function that takes an (n, 2) array of coordinates and returns an (n,) array of y components of the vectors, Defaults to None. + X_data: Defaults to None. + """ self.func = func def func_dim(x, func, dim): @@ -269,10 +441,27 @@ def func_dim(x, func, dim): self.NCx = None self.NCy = None - def get_num_fixed_points(self): + def get_num_fixed_points(self) -> int: + """ + Get the number of fixed points stored in the `Xss` attribute. + + Returns: + int: the number of fixed points + """ return len(self.Xss.get_X()) - def get_fixed_points(self, get_types=True): + def get_fixed_points(self, get_types: Optional[bool] = True) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]: + """ + Get the fixed points stored in the `Xss` attribute, along with their types (stable, saddle, or unstable) if `get_types` is `True`. + + Args: + get_types: whether to include the types of the fixed points. Defaults to `True`. + + Returns: + tuple: a tuple containing: + - X (np.array): an (n, 2) array of coordinates of the fixed points + - ftype (np.array): an (n,) array of the types of the fixed points (-1 for stable, 0 for saddle, 1 for unstable). Only returned if `get_types` is `True`. + """ X = self.Xss.get_X() if not get_types: return X @@ -287,7 +476,15 @@ def get_fixed_points(self, get_types=True): ftype[i] = -1 return X, ftype - def get_Xss_confidence(self, k=50): + def get_Xss_confidence(self, k: Optional[int] = 50) -> np.ndarray: + """Get the confidence of each fixed point stored in the `Xss` attribute. + + Args: + k: the number of nearest neighbors to consider for each fixed point. Defaults to 50. + + Returns: + an (n,) array of confidences for the fixed points + """ X = self.X_data X = X.A if sp.issparse(X) else X Xss = self.Xss.get_X() @@ -317,7 +514,24 @@ def get_Xss_confidence(self, k=50): confidence /= np.max(confidence) return confidence[:-1] - def find_fixed_points_by_sampling(self, n, x_range, y_range, lhs=True, tol_redundant=1e-4): + def find_fixed_points_by_sampling( + self, + n: int, + x_range: Tuple[float, float], + y_range: Tuple[float, float], + lhs: Optional[bool] = True, + tol_redundant: float = 1e-4, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Find fixed points by sampling the vector field within a specified range of coordinates. + + Args: + n: the number of samples to take + x_range: a tuple of two floats specifying the range of x coordinates to sample + y_range: a tuple of two floats specifying the range of y coordinates to sample + lhs: whether to use Latin Hypercube Sampling to generate the samples. Defaults to `True`. + tol_redundant: the tolerance for removing redundant fixed points. Defaults to 1e-4. + """ if lhs: from ..tools.sampling import lhsclassic @@ -335,12 +549,37 @@ def find_fixed_points_by_sampling(self, n, x_range, y_range, lhs=True, tol_redun if len(X) > 0: self.Xss.add_fixed_points(X, J, tol_redundant) - def find_nearest_fixed_point(self, x, x_range, y_range, tol_redundant=1e-4): + def find_nearest_fixed_point( + self, x: np.ndarray, x_range: Tuple[float, float], y_range: Tuple[float, float], tol_redundant: float = 1e-4 + ): + """Find the fixed point closest to a given initial guess within a given range. + + Args: + x: an array specifying the initial guess + x_range: a tuple of two floats specifying the range of x coordinates + y_range: a tuple of two floats specifying the range of y coordinates + tol_redundant: the tolerance for removing redundant fixed points. Defaults to 1e-4. + """ X, J, _ = find_fixed_points(x, self.func, domain=[x_range, y_range], tol_redundant=tol_redundant) if len(X) > 0: self.Xss.add_fixed_points(X, J, tol_redundant) - def compute_nullclines(self, x_range, y_range, find_new_fixed_points=False, tol_redundant=1e-4): + def compute_nullclines( + self, + x_range: Tuple[float, float], + y_range: Tuple[float, float], + find_new_fixed_points: Optional[bool] = False, + tol_redundant: Optional[float] = 1e-4, + ): + """Compute nullclines. Nullclines are curves along which vector field is zero along a particular dimension. + + Args: + x_range: range of x + y_range: range of y + find_new_fixed_points: whether to find new fixed points along the nullclines and add to `self.Xss`. Defaults to False. + s_max: maximum integration length, Defaults to None. + ds: step size, Defaults to None. + """ # compute arguments s_max = 5 * ((x_range[1] - x_range[0]) + (y_range[1] - y_range[0])) ds = s_max / 1e3 @@ -359,6 +598,8 @@ def compute_nullclines(self, x_range, y_range, find_new_fixed_points=False, tol_ outside = is_outside(X, [x_range, y_range]) self.Xss.add_fixed_points(X[~outside], J[~outside], tol_redundant) + # TODO Refactor dict_vf + def output_to_dict(self, dict_vf): dict_vf["NCx"] = self.NCx dict_vf["NCy"] = self.NCy @@ -368,7 +609,37 @@ def output_to_dict(self, dict_vf): return dict_vf -def util_topology(adata, basis, X, dims, func, VecFld, n=25, **kwargs): +def util_topology( + adata: AnnData, + basis: str, + dims: Tuple[int, int], + func: Callable, + VecFld: VecFldDict, + X: Optional[np.ndarray] = None, + n: Optional[int] = 25, + **kwargs, +): + """A function that computes nullclines and fixed points defined by the function func. + + Args: + adata: `AnnData` object containing cell state information. + basis: A string specifying the reduced dimension embedding to use for the computation. + dims: A tuple of two integers specifying the dimensions of X to consider. + func: A vector-valued function taking in coordinates and returning the vector field. + VecFld: `VecFldDict` TypedDict storing information about the vector field and SparseVFC-related parameters and computations. + X: an alternative to providing an `AnnData` object. Provide an np.ndarray from which `dims` are accessed, Defaults to None. + n: An optional integer specifying the number of points to use for computing fixed points. Defaults to 25. + + Returns: + A tuple consisting of the following elements: + - X_basis: an array of shape (n, 2) where n is the number of points in X. This is the subset of X consisting of the first two dimensions specified by dims. If X is not provided, X_basis is taken from the obsm attribute of adata using the key "X_" + basis. + - xlim, ylim: a tuple of floats specifying the limits of the x and y axes, respectively. These are computed based on the minimum and maximum values of X_basis. + - confidence: an array of shape (n, ) containing the confidence scores of the fixed points. + - NCx, NCy: arrays of shape (n, ) containing the x and y coordinates of the nullclines (lines where the derivative of the system is zero), respectively. + - Xss: an array of shape (n, k) where k is the number of dimensions of the system, containing the fixed points. + - ftype: an array of shape (n, ) containing the types of fixed points (attractor, repeller, or saddle). + - an array of shape (n, ) containing the indices of the fixed points in the original data. + """ X_basis = adata.obsm["X_" + basis][:, dims] if X is None else X[:, dims] if X_basis.shape[1] == 2: @@ -410,44 +681,31 @@ def util_topology(adata, basis, X, dims, func, VecFld, n=25, **kwargs): def topography( - adata, - basis="umap", - layer=None, - X=None, - dims=None, - n=25, - VecFld=None, + adata: AnnData, + basis: Optional[str] = "umap", + layer: Optional[str] = None, + X: Optional[np.ndarray] = None, + dims: Optional[list] = None, + n: Optional[int] = 25, + VecFld: Optional[VecFldDict] = None, **kwargs, -): +) -> AnnData: """Map the topography of the single cell vector field in (first) two dimensions. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - an Annodata object. - basis: `str` (default: `trimap`) - The reduced dimension embedding of cells to visualize. - layer: `str` or None (default: None) - Which layer of the data will be used for vector field function reconstruction. This will be used in - conjunction with X. - X: 'np.ndarray' (dimension: n_obs x n_features) - Original data. Not used - dims: `list` or `None` (default: `None`) - The dimensions that will be used for vector field reconstruction. - n: `int` (default: `10`) - Number of samples for calculating the fixed points. - VecFld: `dictionary` or None (default: None) - The reconstructed vector field function. - kwargs: - Key word arguments passed to the find_fixed_point function of the vector field class for high dimension - fixed point identification. - - Returns - ------- - adata: :class:`~anndata.AnnData` - `AnnData` object that is updated with the `VecFld` or 'VecFld_' + basis dictionary in the `uns` attribute. - The `VecFld2D` key stores an instance of the VectorField2D class which presumably has fixed points, - nullcline, separatrix, computed and stored. + Args: + adata: an AnnData object. + basis: The reduced dimension embedding of cells to visualize. + layer: Which layer of the data will be used for vector field function reconstruction. This will be used in conjunction with X. + X: Original data. Not used + dims: The dimensions that will be used for vector field reconstruction. + n: Number of samples for calculating the fixed points. + VecFld: The reconstructed vector field function. + kwargs: Key word arguments passed to the find_fixed_point function of the vector field class for high dimension + fixed point identification. + + Returns: + `AnnData` object that is updated with the `VecFld` or 'VecFld_' + basis dictionary in the `uns` attribute. + The `VecFld2D` key stores an instance of the VectorField2D class which presumably has fixed points, nullcline, separatrix, computed and stored. """ if VecFld is None: @@ -518,106 +776,70 @@ def func(x): def VectorField( adata: anndata.AnnData, - basis: Union[None, str] = None, - layer: Union[None, str] = None, - dims: Union[int, list, None] = None, - genes: Union[list, None] = None, - normalize: bool = False, + basis: Optional[str] = None, + layer: Optional[str] = None, + dims: Optional[Union[int, list]] = None, + genes: Optional[list] = None, + normalize: Optional[bool] = False, grid_velocity: bool = False, grid_num: int = 50, velocity_key: str = "velocity_S", method: str = "SparseVFC", min_vel_corr: float = 0.6, restart_num: int = 5, - restart_seed: Union[None, list] = [0, 100, 200, 300, 400], - model_buffer_path: Union[str, None] = None, + restart_seed: Optional[list] = [0, 100, 200, 300, 400], + model_buffer_path: Optional[str] = None, return_vf_object: bool = False, map_topography: bool = False, pot_curl_div: bool = False, cores: int = 1, - result_key: Union[str, None] = None, + result_key: Optional[str] = None, copy: bool = False, **kwargs, ) -> Union[anndata.AnnData, BaseVectorField]: """Learn a function of high dimensional vector field from sparse single cell samples in the entire space robustly. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains embedding and velocity data - basis: - The embedding data to use. The vector field function will be learned on the low dimensional embedding and - can be then projected back to the high dimensional space. - layer: - Which layer of the data will be used for vector field function reconstruction. The layer once provided, will - override the `basis` argument and then learn the vector field function in high dimensional space. - dims: - The dimensions that will be used for reconstructing vector field functions. If it is an `int` all dimension - from the first dimension to `dims` will be used; if it is a list, the dimensions in the list will be used. - genes: - The gene names whose gene expression will be used for vector field reconstruction. By default (when genes is - set to None), the genes used for velocity embedding (var.use_for_transition) will be used for vector field - reconstruction. Note that the genes to be used need to have velocity calculated. - normalize: - Logic flag to determine whether to normalize the data to have zero means and unit covariance. This is often - required for raw dataset (for example, raw UMI counts and RNA velocity values in high dimension). But it is - normally not required for low dimensional embeddings by PCA or other non-linear dimension reduction methods. - grid_velocity: - Whether to generate grid velocity. Note that by default it is set to be False, but for datasets with - embedding dimension less than 4, the grid velocity will still be generated. Please note that number of total - grids in the space increases exponentially as the number of dimensions increases. So it may quickly lead to - lack of memory, for example, it cannot allocate the array with grid_num set to be 50 and dimension is 6 - (50^6 total grids) on 32 G memory computer. Although grid velocity may not be generated, the vector field - function can still be learned for thousands of dimensions and we can still predict the transcriptomic cell - states over long time period. - grid_num: - The number of grids in each dimension for generating the grid velocity. - velocity_key: - The key from the adata layer that corresponds to the velocity matrix. - method: - Method that is used to reconstruct the vector field functionally. Currently only SparseVFC supported but - other improved approaches are under development. - min_vel_corr: - The minimal threshold for the cosine correlation between input velocities and learned velocities to consider - as a successful vector field reconstruction procedure. If the cosine correlation is less than this - threshold and restart_num > 1, `restart_num` trials will be attempted with different seeds to reconstruct - the vector field function. This can avoid some reconstructions to be trapped in some local optimal. - restart_num: - The number of retrials for vector field reconstructions. - restart_seed: - A list of seeds for each retrial. Must be the same length as `restart_num` or None. - buffer_path: - The directory address keeping all the saved/to-be-saved torch variables and NN modules. When `method` is - set to be `dynode`, buffer_path will set to be - return_vf_object: - Whether or not to include an instance of a vectorfield class in the the `VecFld` dictionary in the `uns` - attribute. - map_topography: - Whether to quantify the topography of vector field. Note that for higher than 2D vector field, we can only - identify fixed points as high-dimensional nullcline and separatrices are mathematically difficult to be - identified. Nullcline and separatrices will also be a surface or manifold in high-dimensional vector field. - pot_curl_div: - Whether to calculate potential, curl or divergence for each cell. Potential can be calculated for any basis - while curl and divergence is by default only applied to 2D basis. However, divergence is applicable for any - dimension while curl is generally only defined for 2/3 D systems. - cores: - Number of cores to run the ddhodge function. If cores is set to be > 1, multiprocessing will be used to + Args: + adata: AnnData object that contains embedding and velocity data + basis: The embedding data to use. The vector field function will be learned on the low dimensional embedding and can be then projected + back to the high dimensional space. + layer: Which layer of the data will be used for vector field function reconstruction. The layer once provided, will override the `basis` + argument and then learn the vector field function in high dimensional space. + dims: The dimensions that will be used for reconstructing vector field functions. If it is an `int` all dimension from the first + dimension to `dims` will be used; if it is a list, the dimensions in the list will be used. + genes: The gene names whose gene expression will be used for vector field reconstruction. By default (when genes is set to None), the genes + used for velocity embedding (var.use_for_transition) will be used for vector field reconstruction. Note that the genes to be used need to have velocity calculated. + normalize: Logic flag to determine whether to normalize the data to have zero means and unit covariance. This is often required for raw + dataset (for example, raw UMI counts and RNA velocity values in high dimension). But it is normally not required for low dimensional embeddings by PCA or other non-linear dimension reduction methods. + grid_velocity: Whether to generate grid velocity. Note that by default it is set to be False, but for datasets with embedding dimension + less than 4, the grid velocity will still be generated. Please note that number of total grids in the space increases exponentially as the number of dimensions increases. So it may quickly lead to lack of memory, for example, it cannot allocate the array with grid_num set to be 50 and dimension is 6 (50^6 total grids) on 32 G memory computer. Although grid velocity may not be generated, the vector field function can still be learned for thousands of dimensions and we can still predict the transcriptomic cell states over long time period. + grid_num: The number of grids in each dimension for generating the grid velocity. + velocity_key: The key from the adata layer that corresponds to the velocity matrix. + method: Method that is used to reconstruct the vector field functionally. Currently only SparseVFC supported but other improved approaches + are under development. + min_vel_corr: The minimal threshold for the cosine correlation between input velocities and learned velocities to consider as a successful + vector field reconstruction procedure. If the cosine correlation is less than this threshold and restart_num > 1, `restart_num` trials will be attempted with different seeds to reconstruct the vector field function. This can avoid some reconstructions to be trapped in some local optimal. + restart_num: The number of retrials for vector field reconstructions. + restart_seed: A list of seeds for each retrial. Must be the same length as `restart_num` or None. + buffer_path: The directory address keeping all the saved/to-be-saved torch variables and NN modules. When `method` is set to be `dynode`, + buffer_path will set to be + return_vf_object: Whether or not to include an instance of a vectorfield class in the the `VecFld` dictionary in the `uns`attribute. + map_topography: Whether to quantify the topography of vector field. Note that for higher than 2D vector field, we can only identify + fixed points as high-dimensional nullcline and separatrices are mathematically difficult to be identified. Nullcline and separatrices will also be a surface or manifold in high-dimensional vector field. + pot_curl_div: Whether to calculate potential, curl or divergence for each cell. Potential can be calculated for any basis while curl and + divergence is by default only applied to 2D basis. However, divergence is applicable for any dimension while curl is generally only defined for 2/3 D systems. + cores: Number of cores to run the ddhodge function. If cores is set to be > 1, multiprocessing will be used to parallel the ddhodge calculation. result_key: The key that will be used as prefix for the vector field key in .uns - copy: - Whether to return a new deep copy of `adata` instead of updating `adata` object passed in arguments and + copy: Whether to return a new deep copy of `adata` instead of updating `adata` object passed in arguments and returning `None`. - kwargs: - Other additional parameters passed to the vectorfield class. - - Returns - ------- - adata: :class:`Union[anndata.AnnData, base_vectorfield]` - If `copy` and `return_vf_object` arguments are set to False, `annData` object is updated with the `VecFld` - dictionary in the `uns` attribute. - If `return_vf_object` is set to True, then a vector field class object is returned. - If `copy` is set to True, a deep copy of the original `adata` object is returned. + kwargs: Other additional parameters passed to the vectorfield class. + + Returns: + If `copy` and `return_vf_object` arguments are set to False, `annData` object is updated with the `VecFld`dictionary in the `uns` attribute. + If `return_vf_object` is set to True, then a vector field class object is returned. + If `copy` is set to True, a deep copy of the original `adata` object is returned. """ logger = LoggerManager.gen_logger("dynamo-topography") logger.info("VectorField reconstruction begins...", indent_level=1) @@ -955,6 +1177,8 @@ def VectorField( **tp_kwargs, ) if pot_curl_div: + from .vector_calculus import curl, divergence + logger.info(f"Running ddhodge to estimate vector field based pseudotime in {basis} basis...") from ..external.hodge import ddhodge @@ -1004,27 +1228,21 @@ def VectorField( def assign_fixedpoints( - adata: anndata.AnnData, + adata: AnnData, basis: str = "pca", cores: int = 1, copy: bool = False, -) -> Union[None, anndata.AnnData]: +) -> Optional[AnnData]: """Assign each cell in our data to a fixed point. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains reconstructed vector field in the `basis` space. - basis: - The vector field function for the `basis` that will be used to assign fixed points for each cell. - cores: - Number of cores to run the fixed-point search for each cell. - copy: - Whether to return a new deep copy of `adata` instead of updating `adata` object passed in arguments and + Args: + adata: AnnData object that contains reconstructed vector field in the `basis` space. + basis: The vector field function for the `basis` that will be used to assign fixed points for each cell. + cores: Number of cores to run the fixed-point search for each cell. + copy: Whether to return a new deep copy of `adata` instead of updating `adata` object passed in arguments and returning `None`. - Returns - ------- + Returns: adata: :class:`Union[None, anndata.AnnData]` If `copy` is set to False, return None but the adata object will updated with a `fps_assignment` in .obs as well as the `'fps_assignment_' + basis` in the .uns. diff --git a/dynamo/vectorfield/utils.py b/dynamo/vectorfield/utils.py index 3e8858d97..15af7da91 100644 --- a/dynamo/vectorfield/utils.py +++ b/dynamo/vectorfield/utils.py @@ -1,39 +1,67 @@ -import functools -import inspect import itertools import multiprocessing as mp +import sys from multiprocessing.dummy import Pool as ThreadPool -from typing import Callable, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import numdifftools as nd import numpy as np import pandas as pd +from anndata import AnnData from scipy.optimize import fsolve from scipy.sparse import issparse from scipy.spatial.distance import cdist, pdist from tqdm import tqdm from ..dynamo_logger import LoggerManager, main_info -from ..tools.utils import ( - form_triu_matrix, - index_condensed_matrix, - subset_dict_with_key_list, - timeit, -) +from ..tools.utils import form_triu_matrix, index_condensed_matrix, timeit from .FixedPoints import FixedPoints - -def is_outside_domain(x, domain): +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + + +class NormDict(TypedDict): + xm: np.ndarray + ym: np.ndarray + xscale: float + yscale: float + fix_velocity: bool + + +class VecFldDict(TypedDict): + X: np.ndarray + valid_ind: float + X_ctrl: np.ndarray + ctrl_idx: float + Y: np.ndarray + beta: float + V: np.ndarray + C: np.ndarray + P: np.ndarray + VFCIndex: np.ndarray + sigma2: float + grid: np.ndarray + grid_V: np.ndarray + iteration: int + tecr_traj: np.ndarray + E_traj: np.ndarray + norm_dict: NormDict + + +def is_outside_domain(x: np.ndarray, domain: Tuple[float, float]) -> np.ndarray: x = x[None, :] if x.ndim == 1 else x return np.any(np.logical_or(x < domain[0], x > domain[1]), axis=1) -def grad(f, x): +def grad(f: Callable, x: np.ndarray) -> nd.Gradient: """Gradient of scalar-valued function f evaluated at x""" return nd.Gradient(f)(x) -def laplacian(f, x): +def laplacian(f: Callable, x: np.ndarray) -> float: """Laplacian of scalar field f evaluated at x""" hes = nd.Hessdiag(f)(x) return sum(hes) @@ -42,9 +70,29 @@ def laplacian(f, x): # --------------------------------------------------------------------------------------------------- # vector field function @timeit -def vector_field_function(x, vf_dict, dim=None, kernel="full", X_ctrl_ind=None, **kernel_kwargs): +def vector_field_function( + x: np.ndarray, + vf_dict: VecFldDict, + dim: Optional[Union[int, np.ndarray]] = None, + kernel: str = "full", + X_ctrl_ind: Optional[List] = None, + **kernel_kwargs, +) -> np.ndarray: """vector field function constructed by sparseVFC. Reference: Regularized vector field learning with sparse approximation for mismatch removal, Ma, Jiayi, etc. al, Pattern Recognition + + Args: + x: Set of cell expression state samples + vf_dict: VecFldDict with stored parameters necessary for reconstruction + dim: Index or indices of dimensions of the K gram matrix to return. Defaults to None. + kernel: one of {"full", "df_kernel", "cf_kernel"}. Defaults to "full". + X_ctrl_ind: Indices of control points at which kernels will be centered. Defaults to None. + + Raises: + ValueError: If the kernel value specified is not one of "full", "df_kernel", or "cf_kernel" + + Returns: + np.ndarray storing the `dim` dimensions of m x m gram matrix K storing the kernel evaluated at each pair of control points """ # x=np.array(x).reshape((1, -1)) if "div_cur_free_kernels" in vf_dict.keys(): @@ -94,7 +142,9 @@ def vector_field_function(x, vf_dict, dim=None, kernel="full", X_ctrl_ind=None, return K -def dynode_vector_field_function(x, vf_dict, dim=None, **kwargs): +def dynode_vector_field_function( + x: np.ndarray, vf_dict: VecFldDict, dim: Optional[Union[int, np.ndarray]] = None, **kwargs +) -> np.ndarray: # try: # import dynode # from dynode.vectorfield import Dynode @@ -127,24 +177,19 @@ def dynode_vector_field_function(x, vf_dict, dim=None, **kwargs): @timeit -def con_K(x, y, beta, method="cdist", return_d=False): +def con_K( + x: np.ndarray, y: np.ndarray, beta: float = 0.1, method: str = "cdist", return_d: bool = False +) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]: """con_K constructs the kernel K, where K(i, j) = k(x, y) = exp(-beta * ||x - y||^2). - Arguments - --------- - x: :class:`~numpy.ndarray` - Original training data points. - y: :class:`~numpy.ndarray` - Control points used to build kernel basis functions. - beta: float (default: 0.1) - Paramerter of Gaussian Kernel, k(x, y) = exp(-beta*||x-y||^2), - return_d: bool - If True the intermediate 3D matrix x - y will be returned for analytical Jacobian. - - Returns - ------- - K: :class:`~numpy.ndarray` - the kernel to represent the vector field function. + Args: + x: Original training data points. + y: Control points used to build kernel basis functions. + beta: Paramerter of Gaussian Kernel, k(x, y) = exp(-beta*||x-y||^2), + return_d: If True the intermediate 3D matrix x - y will be returned for analytical Jacobian. + + Returns: + Tuple(K: the kernel to represent the vector field function, D: """ if method == "cdist" and not return_d: K = cdist(x, y, "sqeuclidean") @@ -168,23 +213,19 @@ def con_K(x, y, beta, method="cdist", return_d=False): @timeit -def con_K_div_cur_free(x, y, sigma=0.8, eta=0.5): +def con_K_div_cur_free( + x: np.ndarray, y: np.ndarray, sigma: int = 0.8, eta: float = 0.5 +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """Construct a convex combination of the divergence-free kernel T_df and curl-free kernel T_cf with a bandwidth sigma and a combination coefficient gamma. - Arguments - --------- - x: :class:`~numpy.ndarray` - Original training data points. - y: :class:`~numpy.ndarray` - Control points used to build kernel basis functions - sigma: int (default: `0.8`) - Bandwidth parameter. - eta: int (default: `0.5`) - Combination coefficient for the divergence-free or the curl-free kernels. - - Returns - ------- + Args: + x: Original training data points. + y: Control points used to build kernel basis functions + sigma: Bandwidth parameter. + eta: Combination coefficient for the divergence-free or the curl-free kernels. + + Returns: A tuple of G (the combined kernel function), divergence-free kernel and curl-free kernel. See also: :func:`sparseVFC`. @@ -225,7 +266,20 @@ def con_K_div_cur_free(x, y, sigma=0.8, eta=0.5): return G, df_kernel, cf_kernel -def get_vf_dict(adata, basis="", vf_key="VecFld"): +def get_vf_dict(adata: AnnData, basis: str = "", vf_key: str = "VecFld") -> VecFldDict: + """Get vector field dictionary from the `.uns` attribute of the AnnData object. + + Args: + adata: `AnnData` object + basis: string indicating the embedding data to use for calculating velocities. Defaults to "". + vf_key: _description_. Defaults to "VecFld". + + Raises: + ValueError: if vf_key or vfkey_basis is not included in the adata object. + + Returns: + vector field dictionary + """ if basis is not None: if len(basis) > 0: vf_key = "%s_%s" % (vf_key, basis) @@ -240,7 +294,7 @@ def get_vf_dict(adata, basis="", vf_key="VecFld"): return vf_dict -def vecfld_from_adata(adata, basis="", vf_key="VecFld"): +def vecfld_from_adata(adata: AnnData, basis: str = "", vf_key: str = "VecFld") -> Tuple[VecFldDict, Callable]: vf_dict = get_vf_dict(adata, basis=basis, vf_key=vf_key) method = vf_dict["method"] @@ -254,51 +308,39 @@ def vecfld_from_adata(adata, basis="", vf_key="VecFld"): return vf_dict, func -def vector_transformation(V, Q): +def vector_transformation(V: np.ndarray, Q: np.ndarray) -> np.ndarray: """Transform vectors from PCA space to the original space using the formula: :math:`\hat{v} = v Q^T`, where `Q, v, \hat{v}` are the PCA loading matrix, low dimensional vector and the transformed high dimensional vector. - Parameters - ---------- - V: :class:`~numpy.ndarray` - The n x k array of vectors to be transformed, where n is the number of vectors, + Args: + V: The n x k array of vectors to be transformed, where n is the number of vectors, k the dimension. - Q: :class:`~numpy.ndarray` - PCA loading matrix with dimension d x k, where d is the dimension of the original space, + Q: PCA loading matrix with dimension d x k, where d is the dimension of the original space, and k the number of leading PCs. - Returns - ------- - ret: :class:`~numpy.ndarray` - The array of transformed vectors. - + Returns: + ret: The array of transformed vectors. """ return V @ Q.T -def vector_field_function_transformation(vf_func, Q, func_inv_x): +def vector_field_function_transformation(vf_func: Callable, Q: np.ndarray, func_inv_x: Callable) -> Callable: """Transform vector field function from PCA space to the original space. The formula used for transformation: :math:`\hat{f} = f Q^T`, where `Q, f, \hat{f}` are the PCA loading matrix, low dimensional vector field function and the transformed high dimensional vector field function. - Parameters - ---------- - vf_func: callable - The vector field function. - Q: :class:`~numpy.ndarray` - PCA loading matrix with dimension d x k, where d is the dimension of the original space, + Args: + vf_func: The vector field function. + Q: PCA loading matrix with dimension d x k, where d is the dimension of the original space, and k the number of leading PCs. - func_inv_x: callable - The function that transform x back into the PCA space. + func_inv_x: The function that transform x back into the PCA space. - Returns - ------- - ret: callable - The transformed vector field function. + Returns: + The transformed vector field function. """ return lambda x: vf_func(func_inv_x(x)) @ Q.T @@ -306,23 +348,18 @@ def vector_field_function_transformation(vf_func, Q, func_inv_x): # --------------------------------------------------------------------------------------------------- # jacobian -def Jacobian_rkhs_gaussian(x, vf_dict, vectorize=False): +def Jacobian_rkhs_gaussian(x: np.ndarray, vf_dict: VecFldDict, vectorize: bool = False) -> np.ndarray: """analytical Jacobian for RKHS vector field functions with Gaussian kernel. - Arguments - --------- - x: :class:`~numpy.ndarray` - Coordinates where the Jacobian is evaluated. - vf_dict: dict - A dictionary containing RKHS vector field control points, Gaussian bandwidth, + Args: + x: Coordinates where the Jacobian is evaluated. + vf_dict: A dictionary containing RKHS vector field control points, Gaussian bandwidth, and RKHS coefficients. Essential keys: 'X_ctrl', 'beta', 'C' - Returns - ------- - J: :class:`~numpy.ndarray` + Returns: Jacobian matrices stored as d-by-d-by-n numpy arrays evaluated at x. - d is the number of dimensions and n the number of coordinates in x. + d is the number of dimensions and n the number of coordinates in x. """ if x.ndim == 1: K, D = con_K(x[None, :], vf_dict["X_ctrl"], vf_dict["beta"], return_d=True) @@ -342,7 +379,7 @@ def Jacobian_rkhs_gaussian(x, vf_dict, vectorize=False): return -2 * vf_dict["beta"] * J -def Jacobian_rkhs_gaussian_parallel(x, vf_dict, cores=None): +def Jacobian_rkhs_gaussian_parallel(x: np.ndarray, vf_dict: VecFldDict, cores: Optional[int] = None) -> np.ndarray: n = len(x) if cores is None: cores = mp.cpu_count() @@ -359,7 +396,7 @@ def Jacobian_rkhs_gaussian_parallel(x, vf_dict, cores=None): return ret -def Jacobian_numerical(f: Callable, input_vector_convention: str = "row"): +def Jacobian_numerical(f: Callable, input_vector_convention: str = "row") -> Union[Callable, nd.Jacobian]: """ Get the numerical Jacobian of the vector field function. If the input_vector_convention is 'row', it means that fjac takes row vectors @@ -390,7 +427,7 @@ def f_aux(x): @timeit -def elementwise_jacobian_transformation(Js, qi, qj): +def elementwise_jacobian_transformation(Js: np.ndarray, qi: np.ndarray, qj: np.ndarray) -> np.ndarray: """Inverse transform low dimensional k x k Jacobian matrix (:math:`\partial F_i / \partial x_j`) back to the d-dimensional gene expression space. The formula used to inverse transform Jacobian matrix calculated from low dimension (PCs) is: @@ -398,19 +435,13 @@ def elementwise_jacobian_transformation(Js, qi, qj): where `Q, J, Jac` are the PCA loading matrix, low dimensional Jacobian matrix and the inverse transformed high dimensional Jacobian matrix. This function takes only one row from Q to form qi or qj. - Parameters - ---------- - Js: :class:`~numpy.ndarray` - k x k x n matrices of n k-by-k Jacobians. - qi: :class:`~numpy.ndarray` - The i-th row of the PC loading matrix Q with dimension d x k, corresponding to the effector gene i. - qj: :class:`~numpy.ndarray` - The j-th row of the PC loading matrix Q with dimension d x k, corresponding to the regulator gene j. - - Returns - ------- - ret: :class:`~numpy.ndarray` - The calculated Jacobian elements (:math:`\partial F_i / \partial x_j`) for each cell. + Args: + Js: k x k x n matrices of n k-by-k Jacobians. + qi: The i-th row of the PC loading matrix Q with dimension d x k, corresponding to the effector gene i. + qj: The j-th row of the PC loading matrix Q with dimension d x k, corresponding to the regulator gene j. + + Returns: + The calculated Jacobian elements (:math:`\partial F_i / \partial x_j`) for each cell. """ Js = np.atleast_3d(Js) @@ -422,23 +453,21 @@ def elementwise_jacobian_transformation(Js, qi, qj): return ret -def Jacobian_kovf(x, fjac_base, K, Q, exact=False, mu=None): +def Jacobian_kovf( + x: np.ndarray, fjac_base: Callable, K: np.ndarray, Q: np.ndarray, exact: bool = False, mu: Optional[float] = None +) -> np.ndarray: """analytical Jacobian for RKHS vector field functions with Gaussian kernel. - Arguments - --------- - x: :class:`~numpy.ndarray` - Coordinates where the Jacobian is evaluated. - vf_dict: dict - A dictionary containing RKHS vector field control points, Gaussian bandwidth, - and RKHS coefficients. - Essential keys: 'X_ctrl', 'beta', 'C' + Args: + x: Coordinates where the Jacobian is evaluated. + vf_dict: + A dictionary containing RKHS vector field control points, Gaussian bandwidth, + and RKHS coefficients. + Essential keys: 'X_ctrl', 'beta', 'C' - Returns - ------- - J: :class:`~numpy.ndarray` + Returns: Jacobian matrices stored as d-by-d-by-n numpy arrays evaluated at x. - d is the number of dimensions and n the number of coordinates in x. + d is the number of dimensions and n the number of coordinates in x. """ if K.ndim == 1: K = np.diag(K) @@ -465,35 +494,24 @@ def Jacobian_kovf(x, fjac_base, K, Q, exact=False, mu=None): @timeit -def subset_jacobian_transformation(Js, Qi, Qj, cores=1): +def subset_jacobian_transformation(Js: np.ndarray, Qi: np.ndarray, Qj: np.ndarray, cores: int = 1) -> np.ndarray: """Transform Jacobian matrix (:math:`\partial F_i / \partial x_j`) from PCA space to the original space. The formula used for transformation: :math:`\hat{J} = Q J Q^T`, where `Q, J, \hat{J}` are the PCA loading matrix, low dimensional Jacobian matrix and the inverse transformed high dimensional Jacobian matrix. This function takes multiple rows from Q to form Qi or Qj. - Parameters - ---------- - fjac: callable - The function for calculating numerical Jacobian matrix. - X: :class:`~numpy.ndarray` - The samples coordinates with dimension n_obs x n_PCs, from which Jacobian will be calculated. - Qi: :class:`~numpy.ndarray` - PCA loading matrix with dimension n' x n_PCs of the effector genes, from which local dimension Jacobian matrix (k x k) + Args: + Js: Original (k x k) dimension Jacobian matrix + Qi: PCA loading matrix with dimension n' x n_PCs of the effector genes, from which local dimension Jacobian matrix (k x k) will be inverse transformed back to high dimension. - Qj: :class:`~numpy.ndarray` - PCs loading matrix with dimension n' x n_PCs of the regulator genes, from which local dimension Jacobian matrix (k x k) + Qj: PCs loading matrix with dimension n' x n_PCs of the regulator genes, from which local dimension Jacobian matrix (k x k) will be inverse transformed back to high dimension. - cores: int (default: 1): - Number of cores to calculate Jacobian. If cores is set to be > 1, multiprocessing will be used to + cores: Number of cores to calculate Jacobian. If cores is set to be > 1, multiprocessing will be used to parallel the Jacobian calculation. - return_J: bool (default: False) - Whether to return the raw tensor of Jacobian matrix of each cell before transformation. - Returns - ------- - ret: :class:`~numpy.ndarray` - The calculated Jacobian matrix (n_gene x n_gene x n_obs) for each cell. + Returns: + The calculated Jacobian matrix (n_gene x n_gene x n_obs) for each cell. """ Js = np.atleast_3d(Js) @@ -523,7 +541,7 @@ def subset_jacobian_transformation(Js, Qi, Qj, cores=1): return ret -def transform_jacobian(Js, Qi, Qj, pbar=False): +def transform_jacobian(Js: np.ndarray, Qi: np.ndarray, Qj: np.ndarray, pbar=False) -> np.ndarray: d1, d2, n = Qi.shape[0], Qj.shape[0], Js.shape[2] ret = np.zeros((d1, d2, n), dtype=np.float32) if pbar: @@ -536,10 +554,17 @@ def transform_jacobian(Js, Qi, Qj, pbar=False): return ret -def average_jacobian_by_group(Js, group_labels): +def average_jacobian_by_group(Js: np.ndarray, group_labels: List[str]) -> Dict[str, np.ndarray]: """ Returns a dictionary of averaged jacobians with group names as the keys. No vectorized indexing was used due to its high memory cost. + + Args: + Js: List of Jacobian matrices + group_labels: list of group labels + + Returns: + dictionary with group labels as keys and average Jacobians as values """ groups = np.unique(group_labels) @@ -561,23 +586,18 @@ def average_jacobian_by_group(Js, group_labels): # Hessian -def Hessian_rkhs_gaussian(x, vf_dict): +def Hessian_rkhs_gaussian(x: np.ndarray, vf_dict: VecFldDict) -> np.ndarray: """analytical Hessian for RKHS vector field functions with Gaussian kernel. - Arguments - --------- - x: :class:`~numpy.ndarray` - Coordinates where the Hessian is evaluated. Note that x has to be 1D. - vf_dict: dict - A dictionary containing RKHS vector field control points, Gaussian bandwidth, - and RKHS coefficients. - Essential keys: 'X_ctrl', 'beta', 'C' + Args: + x: Coordinates where the Hessian is evaluated. Note that x has to be 1D. + vf_dict: A dictionary containing RKHS vector field control points, Gaussian bandwidth, + and RKHS coefficients. + Essential keys: 'X_ctrl', 'beta', 'C' - Returns - ------- - H: :class:`~numpy.ndarray` - Hessian matrix stored as d-by-d-by-d numpy arrays evaluated at x. - d is the number of dimensions. + Returns: + H: Hessian matrix stored as d-by-d-by-d numpy arrays evaluated at x. + d is the number of dimensions. """ x = np.atleast_2d(x) @@ -595,7 +615,7 @@ def Hessian_rkhs_gaussian(x, vf_dict): return H -def hessian_transformation(H, qi, Qj, Qk): +def hessian_transformation(H: np.ndarray, qi: np.ndarray, Qj: np.ndarray, Qk: np.ndarray) -> np.ndarray: """Inverse transform low dimensional k x k x k Hessian matrix (:math:`\partial^2 F_i / \partial x_j \partial x_k`) back to the d-dimensional gene expression space. The formula used to inverse transform Hessian matrix calculated from low dimension (PCs) is: @@ -603,21 +623,14 @@ def hessian_transformation(H, qi, Qj, Qk): where `q, H, h` are the PCA loading matrix, low dimensional Hessian matrix and the inverse transformed element from the high dimensional Hessian matrix. - Parameters - ---------- - H: :class:`~numpy.ndarray` - k x k x k matrix of the Hessian. - qi: :class:`~numpy.ndarray` - The i-th row of the PC loading matrix Q with dimension d x k, corresponding to the effector i. - Qj: :class:`~numpy.ndarray` - The submatrix of the PC loading matrix Q with dimension d x k, corresponding to regulators j. - Qk: :class:`~numpy.ndarray` - The submatrix of the PC loading matrix Q with dimension d x k, corresponding to co-regulators k. - - Returns - ------- - h: :class:`~numpy.ndarray` - The calculated Hessian matrix for the effector i w.r.t regulators j and co-regulators k. + Args: + H: k x k x k matrix of the Hessian. + qi: The i-th row of the PC loading matrix Q with dimension d x k, corresponding to the effector i. + Qj: The submatrix of the PC loading matrix Q with dimension d x k, corresponding to regulators j. + Qk: The submatrix of the PC loading matrix Q with dimension d x k, corresponding to co-regulators k. + + Returns: + h: The calculated Hessian matrix for the effector i w.r.t regulators j and co-regulators k. """ h = np.einsum("ijk, di -> djk", H, qi) @@ -627,27 +640,22 @@ def hessian_transformation(H, qi, Qj, Qk): return h -def elementwise_hessian_transformation(H, qi, qj, qk): +def elementwise_hessian_transformation(H: np.ndarray, qi: np.ndarray, qj: np.ndarray, qk: np.ndarray) -> np.ndarray: """Inverse transform low dimensional k x k x k Hessian matrix (:math:`\partial^2 F_i / \partial x_j \partial x_k`) back to the d-dimensional gene expression space. The formula used to inverse transform Hessian matrix calculated from low dimension (PCs) is: :math:`Jac = Q J Q^T`, where `Q, J, Jac` are the PCA loading matrix, low dimensional Jacobian matrix and the inverse transformed high dimensional Jacobian matrix. This function takes only one row from Q to form qi or qj. - Parameters - ---------- - H: :class:`~numpy.ndarray` - k x k x k matrix of the Hessian. - qi: :class:`~numpy.ndarray` - The i-th row of the PC loading matrix Q with dimension d x k, corresponding to the effector i. - qj: :class:`~numpy.ndarray` - The j-th row of the PC loading matrix Q with dimension d x k, corresponding to the regulator j. - qk: :class:`~numpy.ndarray` - The k-th row of the PC loading matrix Q with dimension d x k, corresponding to the co-regulator k. - Returns - ------- - h: :class:`~numpy.ndarray` - The calculated Hessian elements for each cell. + + Args: + H: k x k x k matrix of the Hessian. + qi: The i-th row of the PC loading matrix Q with dimension d x k, corresponding to the effector i. + qj: The j-th row of the PC loading matrix Q with dimension d x k, corresponding to the regulator j. + qk: The k-th row of the PC loading matrix Q with dimension d x k, corresponding to the co-regulator k. + + Returns: + h: The calculated Hessian elements for each cell. """ h = np.einsum("ijk, i -> jk", H, qi) @@ -657,7 +665,13 @@ def elementwise_hessian_transformation(H, qi, qj, qk): # --------------------------------------------------------------------------------------------------- -def Laplacian(H): +def Laplacian(H: np.ndarray) -> np.ndarray: + """ + Computes the Laplacian of the Hessian matrix by summing the diagonal elements of the Hessian matrix (summing the unmixed second partial derivatives) + :math: `\Delta f = \sum_{i=1}^{n} \frac{\partial^2 f}{\partial x_i^2}` + Args: + H: Hessian matrix + """ # when H has four dimensions, H is calculated across all cells if H.ndim == 4: L = np.zeros([H.shape[2], H.shape[3]]) @@ -675,19 +689,30 @@ def Laplacian(H): # --------------------------------------------------------------------------------------------------- # dynamical properties -def _divergence(f, x): +def _divergence(f: Callable, x: np.ndarray) -> float: """Divergence of the reconstructed vector field function f evaluated at x""" jac = nd.Jacobian(f)(x) return np.trace(jac) @timeit -def compute_divergence(f_jac, X, Js=None, vectorize_size=1000): +def compute_divergence( + f_jac: Callable, X: np.ndarray, Js: Optional[np.ndarray] = None, vectorize_size: int = 1000 +) -> np.ndarray: """Calculate divergence for many samples by taking the trace of a Jacobian matrix. vectorize_size is used to control the number of samples computed in each vectorized batch. If vectorize_size = 1, there's no vectorization whatsoever. If vectorize_size = None, all samples are vectorized. + + Args: + f_jac: function for calculating Jacobian from cell states + X: cell states + Js: Jacobian matrices for each sample, if X is not provided + vectorize_size: number of Jacobian matrices to process at once in the vectorization + + Returns: + divergence np.ndarray across Jacobians for many samples """ n = len(X) if vectorize_size is None: @@ -700,13 +725,22 @@ def compute_divergence(f_jac, X, Js=None, vectorize_size=1000): return div -def acceleration_(v, J): +def acceleration_(v: np.ndarray, J: np.ndarray) -> np.ndarray: + """Calculate acceleration by dotting the Jacobian and the velocity vector. + + Args: + v: velocity vector + J: Jacobian matrix + + Returns: + Acceleration vector, with one element for the acceleration of each component + """ if v.ndim == 1: v = v[:, None] return J.dot(v) -def curvature_method1(a: np.array, v: np.array): +def curvature_method1(a: np.array, v: np.array) -> float: """https://link.springer.com/article/10.1007/s12650-018-0474-6""" if v.ndim == 1: v = v[:, None] @@ -715,7 +749,7 @@ def curvature_method1(a: np.array, v: np.array): return kappa -def curvature_method2(a: np.array, v: np.array): +def curvature_method2(a: np.array, v: np.array) -> float: """https://dl.acm.org/doi/10.5555/319351.319441""" # if v.ndim == 1: v = v[:, None] kappa = (np.multiply(a, np.dot(v, v)) - np.multiply(v, np.dot(v, a))) / np.linalg.norm(v) ** 4 @@ -882,7 +916,18 @@ def compute_curl(f_jac, X): # --------------------------------------------------------------------------------------------------- # ranking related utilies -def get_metric_gene_in_rank(mat: np.mat, genes: list, neg: bool = False): +def get_metric_gene_in_rank(mat: np.mat, genes: list, neg: bool = False) -> Tuple[np.ndarray, np.ndarray]: + """Calculate ranking of genes based on mean value of matrix. + + Args: + mat: A matrix with shape (n, m) where n is the number of samples and m is the number of genes. + genes: A list of m gene names. + neg: A boolean flag indicating whether the ranking should be in ascending (True) or descending (False) order. Defaults to False (descending order). + + Returns: + metric_in_rank: A list of m gene scores, sorted in ascending or descending order depending on the value of neg. + genes_in_rank: A list of m gene names, sorted in the same order as metric_in_rank. + """ metric_in_rank = mat.mean(0).A1 if issparse(mat) else mat.mean(0) rank = metric_in_rank.argsort() if neg else metric_in_rank.argsort()[::-1] metric_in_rank, genes_in_rank = metric_in_rank[rank], genes[rank] @@ -892,7 +937,21 @@ def get_metric_gene_in_rank(mat: np.mat, genes: list, neg: bool = False): def get_metric_gene_in_rank_by_group( mat: np.mat, genes: list, groups: np.array, selected_group, neg: bool = False -) -> tuple: +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Calculate ranking of genes based on mean value of matrix, grouped by selected group. + + Args: + mat: A matrix with shape (n, m) where n is the number of samples and m is the number of genes. + genes: A list of m gene names. + groups: A numpy array with shape (n,) indicating the group membership for each sample. + selected_group: The group for which the ranking should be calculated. + neg: A boolean flag indicating whether the ranking should be in ascending (True) or descending (False) order. Defaults to False (descending order). + + Returns: + gene_wise_metrics: A list of m gene scores, sorted in ascending or descending order depending on the value of neg. + group_wise_metrics: A list of m gene scores, calculated as the mean value of mat for samples belonging to the selected group. + genes_in_rank: A list of m gene names, sorted in the same order as gene_wise_metrics. + """ mask = groups == selected_group if type(mask) == pd.Series: mask = mask.values @@ -907,7 +966,18 @@ def get_metric_gene_in_rank_by_group( return gene_wise_metrics, group_wise_metrics, genes_in_rank -def get_sorted_metric_genes_df(df: pd.DataFrame, genes: list, neg: bool = False) -> tuple: +def get_sorted_metric_genes_df(df: pd.DataFrame, genes: list, neg: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Sort metric and gene lists based on values in dataframe. + + Args: + df: A dataframe with shape (m, n) where m is the number of genes and n is the number of samples. + genes: A list of m gene names. + neg: A boolean flag indicating whether the ranking should be in ascending (True) or descending (False) order. Defaults to False (descending order). + + Returns: + sorted_metric: A dataframe with shape (m, n) containing the sorted metric values. + sorted_genes: A dataframe with shape (m, n) containing the sorted gene names. + """ sorted_metric = pd.DataFrame( { key: (sorted(values, reverse=False) if neg else sorted(values, reverse=True)) @@ -923,7 +993,34 @@ def get_sorted_metric_genes_df(df: pd.DataFrame, genes: list, neg: bool = False) return sorted_metric, sorted_genes -def rank_vector_calculus_metrics(mat: np.mat, genes: list, group, groups: list, uniq_group: list) -> tuple: +def rank_vector_calculus_metrics(mat: np.mat, genes: list, group, groups: list, uniq_group: list) -> Tuple: + """Calculate ranking of genes based on vector calculus metric. + + Args: + mat: A matrix with shape (n, m) where n is the number of samples and m is the number of genes. + genes: A list of m gene names. + group: The group for which the ranking should be calculated. If group is None, the ranking is calculated for all samples. + groups: A list of n group labels, indicating the group membership for each sample. + uniq_group: A list of unique group labels. + + Returns: + If group is None: + metric_in_rank: A list of m gene scores, sorted by the mean of the the absolute values of the elements in each column of mat. + genes_in_rank: A list of m gene names, sorted in the same order as metric_in_rank. + pos_metric_in_rank: A list of m gene scores, sorted by the mean of the positive elements in each column of mat. + pos_genes_in_rank: A list of m gene names, sorted in the same order as pos_metric_in_rank. + neg_metric_in_rank: A list of m gene scores, calculated as the mean of the negative elements in each column of mat. + neg_genes_in_rank: A list of m gene names, sorted in the same order as neg_metric_in_rank. + If group is not None: + gene_wise_metrics: A dictionary with keys equal to the unique group labels in uniq_group, and values equal to lists of m gene scores, calculated as the mean value of mat for samples belonging to each group. + group_wise_metrics: A dictionary with keys equal to the unique group labels in uniq_group, and values equal to lists of m gene scores, calculated as the mean value of mat for samples belonging to each group. + gene_wise_genes: A dictionary with keys equal to the unique group labels in uniq_group, and values equal to lists of m gene names, sorted in the same order as the corresponding values in gene_wise_metrics. + gene_wise_pos_metrics: A dictionary with keys equal to the unique group labels in uniq_group, and values equal to lists of m gene scores, calculated as the mean value of the positive elements in mat for samples belonging to each group. + group_wise_pos_metrics: A dictionary with keys equal to the unique group labels in uniq_group, and values equal to lists of m gene scores, calculated as the mean value of the positive elements in mat for samples belonging to each group. + gene_wise_pos_genes: A dictionary with keys equal to the unique group labels in uniq_group, and values equal to lists of m gene names, sorted in the same order as the corresponding values in gene_wise_pos_metrics. + gene_wise_neg_metrics: A dictionary with keys equal to the unique group labels in uniq_group, and values equal to lists of m gene scores, calculated as the mean value of the negative elements in mat for samples belonging to each group. + group_wise_neg_metrics: A dictionary with keys equal to the unique labels in uniq_group, and values equal to lists of m gene scores, calculated as the mean value of the negative elements in mat for samples belonging to each group. + """ main_info("split mat to a positive matrix and a negative matrix.") if issparse(mat): mask = mat.data > 0 @@ -1118,10 +1215,24 @@ def remove_redundant_points(X, tol=1e-4, output_discard=False): def find_fixed_points( x0_list: Union[list, np.ndarray], func_vf: Callable, - domain=None, + domain: Optional[np.ndarray] = None, tol_redundant: float = 1e-4, return_all: bool = False, -) -> tuple: +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Given sampling points, a function, and a domain, finds points for which func_vf(x) = 0. + + Args: + x0_list: Array-like structure with sampling points + func_vf: Function for which to find fixed points + domain: Finds fixed points within the given domain of shape (n_dim, 2) + tol_redundant: Margin outside of which points are considered distinct + return_all: If set to true, always return a tuple of three arrays as output + + Returns: + A tuple with the solutions, Jacobian matrix, and function values at the solutions. + + """ + def vf_aux(x): """auxillary function unifying dimensionality""" v = func_vf(x) @@ -1209,19 +1320,13 @@ def parse_int_df( ) -> pd.DataFrame: """parse the dataframe produced from vector field ranking for gene interactions or switch gene pairs - Parameters - ---------- - df: - The dataframe that returned from performing the `int` or `switch` mode ranking via dyn.vf.rank_jacobian_genes. - self_int: - Whether to keep self-interactions pairs. - genes: - List of genes that are used to filter for gene interactions. - - Returns - ------- - res: - The parsed interaction dataframe. + Args: + df: The dataframe that returned from performing the `int` or `switch` mode ranking via dyn.vf.rank_jacobian_genes. + self_int: Whether to keep self-interactions pairs. + genes: List of genes that are used to filter for gene interactions. + + Returns: + res: The parsed interaction dataframe. """ df_shape, columns = df.shape, df.columns @@ -1258,12 +1363,24 @@ def parse_int_df( # --------------------------------------------------------------------------------------------------- # jacobian retrival related utilies def get_jacobian( - adata, - regulators, - effectors, + adata: AnnData, + regulators: np.ndarray, + effectors: np.ndarray, jkey: str = "jacobian", j_basis: str = "pca", -): +) -> pd.DataFrame: + """Return dataframe with Jacobian values (where regulators are in the denominator and effectors in the numerator) + + Args: + adata: AnnData object + regulators: string labels capturing regulator genes of interest + effectors: string labels capturing effector genes of interest + jkey: Defaults to "jacobian". + j_basis: Defaults to "pca". + + Returns: + dataframe with Jacobian values (where regulators are in the denominator and effectors in the numerator) + """ regulators, effectors = ( list(np.unique(regulators)) if regulators is not None else None, @@ -1314,8 +1431,17 @@ def get_jacobian( # --------------------------------------------------------------------------------------------------- # jacobian subset related utilies -def subset_jacobian(adata, cells, basis="pca"): - """Subset adata object while also subset the jacobian, cells must be a vector of cell indices.""" +def subset_jacobian(adata: AnnData, cells: np.ndarray, basis: str = "pca"): + """Subset adata object while also subset the jacobian + + Args: + adata: AnnData object + cells: vector of cell indices + basis: string specifying jacobian layer to subset + + Returns: + subset of adata + """ adata_subset = adata[cells] diff --git a/dynamo/vectorfield/vector_calculus.py b/dynamo/vectorfield/vector_calculus.py index 889a1b337..372a83db5 100644 --- a/dynamo/vectorfield/vector_calculus.py +++ b/dynamo/vectorfield/vector_calculus.py @@ -2,6 +2,12 @@ # from anndata._core.views import ArrayView # import scipy.sparse as sp +from typing import Dict, List, Optional, Union +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal + import numpy as np import pandas as pd from anndata._core.anndata import AnnData @@ -52,54 +58,65 @@ from .utils import dynode_vector_field_function +def get_vf_class(adata: AnnData, basis: str = "pca") -> SvcVectorField: + """Get the corresponding vector field class according to different methods. + + Args: + adata: AnnData object that contains the reconstructed vector field in the `uns` attribute. + basis: The embedding data in which the vector field was reconstructed. + + Returns: + SvcVectorField object that is extracted from the `uns` attribute of adata. + """ + vf_dict = get_vf_dict(adata, basis=basis) + if "method" not in vf_dict.keys(): + vf_dict["method"] = "sparsevfc" + if vf_dict["method"].lower() == "sparsevfc": + vector_field_class = SvcVectorField() + vector_field_class.from_adata(adata, basis=basis) + elif vf_dict["method"].lower() == "dynode": + vf_dict["parameters"]["load_model_from_buffer"] = True + vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) + else: + raise ValueError("current only support two methods, SparseVFC and dynode") + return vector_field_class + + def velocities( - adata, init_cells, init_states=None, basis=None, vector_field_class=None, layer="X", dims=None, Qkey="PCs" -): + adata: AnnData, + init_cells: Optional[List] = None, + init_states: Optional[list] = None, + basis: Optional[str] = None, + vector_field_class: Optional[scVectorField.BaseVectorField] = None, + layer: Optional[str] = "X", + dims: Optional[Union[int, list]] = None, + Qkey: str = "PCs", +) -> AnnData: """Calculate the velocities for any cell state with the reconstructed vector field function. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field function in the `uns` attribute. - init_cells: list (default: None) - Cell name or indices of the initial cell states for the historical or future cell state prediction with - numerical integration. If the names in init_cells are not find in the adata.obs_name, it will be treated as + Args: + adata: AnnData object that contains the reconstructed vector field function in the `uns` attribute. + init_cells: Cell name or indices of the initial cell states for the historical or future cell state prediction with + numerical integration. If the names in init_cells are not found in the adata.obs_name, they will be treated as cell indices and must be integers. - init_states: :class:`~numpy.ndarray` or None (default: None) - Initial cell states for the historical or future cell state prediction with numerical integration. - basis: str or None (default: None) - The embedding data to use for calculating velocities. If `basis` is either `umap` or `pca`, the + init_states: Initial cell states for the historical or future cell state prediction with numerical integration. + basis: The embedding data to use for calculating velocities. If `basis` is either `umap` or `pca`, the reconstructed trajectory will be projected back to high dimensional space via the `inverse_transform` function. - vector_field_class: :class:`~scVectorField.vectorfield` - If not None, the speed will be computed using this class instead of the vector field stored in adata. You + vector_field_class: If not None, the speed will be computed using this class instead of the vector field stored in adata. You can set up the class with a known ODE function, useful when the data is generated through simulation. - layer: str or None (default: 'X') - Which layer of the data will be used for predicting cell fate with the reconstructed vector field function. - The layer once provided, will override the `basis` argument and then predicting cell fate in high + layer: Which layer of the data will be used for predicting cell fate with the reconstructed vector field function. + The layer once provided, will override the `basis` argument and this function will then predict cell fate in high dimensional space. - dims: int, list, or None (default: None) - The dimensions that will be selected for velocity calculation. - Qkey: str (default: 'PCs') - The key of the PCA loading matrix in `.uns`. Only used when basis is `pca`. - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `"velocities"` related key in the `.uns`. + dims: The dimensions that will be selected for velocity calculation. + Qkey: The key of the PCA loading matrix in `.uns`. Only used when basis is `pca`. + + Returns: + AnnData object that is updated with the `"velocities"` related key in the `.uns`. """ if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) init_states, _, _, _ = fetch_states(adata, init_states, init_cells, basis, layer, False, None) @@ -138,45 +155,28 @@ def velocities( def speed( - adata, - basis="umap", - vector_field_class=None, - method="analytical", -): + adata: AnnData, + basis: Optional[str] = "umap", + vector_field_class: Optional[scVectorField.BaseVectorField] = None, + method: str = "analytical", +) -> AnnData: """Calculate the speed for each cell with the reconstructed vector field function. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field function in the `uns` attribute. - basis: str or None (default: `umap`) - The embedding data in which the vector field was reconstructed. - vector_field_class: :class:`~scVectorField.vectorfield` - If not None, the speed will be computed using this class instead of the vector field stored in adata. You + Args: + adata: AnnData object that contains the reconstructed vector field function in the `uns` attribute. + basis: The embedding data in which the vector field was reconstructed. + vector_field_class: If not None, the speed will be computed using this class instead of the vector field stored in adata. You can set up the class with a known ODE function, useful when the data is generated through simulation. - method: str (default: `analytical`) - The method that will be used for calculating speed, either `analytical` or `numeric`. `analytical` + method: The method that will be used for calculating speed, either `analytical` or `numeric`. `analytical` method will use the analytical form of the reconstructed vector field for calculating Jacobian. Otherwise, raw velocity vectors are used. - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'speed'` key in the `.obs`. + Returns: + AnnData object that is updated with the `'speed'` key in the `.obs`. """ if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) X, V = vector_field_class.get_data() @@ -193,17 +193,17 @@ def speed( def jacobian( - adata, - regulators=None, - effectors=None, - cell_idx=None, - sampling=None, - sample_ncells=1000, - basis="pca", - Qkey="PCs", - vector_field_class=None, - method="analytical", - store_in_adata=True, + adata: AnnData, + regulators: Optional[List] = None, + effectors: Optional[List] = None, + cell_idx: Optional[List] = None, + sampling: Optional[Literal['random', 'velocity', 'trn']] = None, + sample_ncells: int = 1000, + basis: str = "pca", + Qkey: str = "PCs", + vector_field_class: Optional[scVectorField.BaseVectorField] = None, + method: str = "analytical", + store_in_adata: bool = True, **kwargs, ): """Calculate Jacobian for each cell with the reconstructed vector field. @@ -213,63 +213,39 @@ def jacobian( supported shortly. Note that we compute the Jacobian for the RKHS kernel vector field analytically, which is much more computationally efficient than the numerical method. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field in `.uns`. - regulators: list - The list of genes that will be used as regulators when calculating the cell-wise Jacobian matrix. The + Args: + adata: AnnData object that contains the reconstructed vector field in `.uns`. + regulators: The list of genes that will be used as regulators when calculating the cell-wise Jacobian matrix. The Jacobian is the matrix consisting of partial derivatives of the vector field wrt gene expressions. It can be used to evaluate the change in velocities of effectors (see below) as the expressions of regulators increase. The regulators are the denominators of the partial derivatives. - effectors: list or None (default: None) - The list of genes that will be used as effectors when calculating the cell-wise Jacobian matrix. The + effectors: The list of genes that will be used as effectors when calculating the cell-wise Jacobian matrix. The effectors are the numerators of the partial derivatives. - cell_idx: list or None (default: None) - A list of cell index (or boolean flags) for which the jacobian is calculated. + cell_idx: A list of cell index (or boolean flags) for which the jacobian is calculated. If `None`, all or a subset of sampled cells are used. sampling: {None, 'random', 'velocity', 'trn'}, (default: None) See specific information on these methods in `.tl.sample`. If `None`, all cells are used. - sample_ncells: int (default: 1000) - The number of cells to be sampled. If `sampling` is None, this parameter is ignored. - basis: str (default: 'pca') - The embedding data in which the vector field was reconstructed. If `None`, use the vector field function - that was reconstructed directly from the original unredu ced gene expression space. - Qkey: str (default: 'PCs') - The key of the PCA loading matrix in `.uns`. - vector_field_class: :class:`~scVectorField.vectorfield` - If not `None`, the jacobian will be computed using this class instead of the vector field stored in adata. - method: str (default: 'analytical') - The method that will be used for calculating Jacobian, either `'analytical'` or `'numerical'`. + sample_ncells: The number of cells to be sampled. If `sampling` is None, this parameter is ignored. + basis: The embedding data in which the vector field was reconstructed. If `None`, use the vector field function + that was reconstructed directly from the original unreduced gene expression space. + Qkey: The key of the PCA loading matrix in `.uns`. + vector_field_class: If not `None`, the jacobian will be computed using this class instead of the vector field stored in adata. + method: The method that will be used for calculating Jacobian, either `'analytical'` or `'numerical'`. `'analytical'` method uses the analytical expressions for calculating Jacobian while `'numerical'` method uses numdifftools, a numerical differentiation tool, for computing Jacobian. `'analytical'` method is much more efficient. - cores: int (default: 1) - Number of cores to calculate Jacobian. If cores is set to be > 1, multiprocessing will be used to + cores: Number of cores to calculate Jacobian. If cores is set to be > 1, multiprocessing will be used to parallel the Jacobian calculation. - kwargs: - Any additional keys that will be passed to elementwise_jacobian_transformation function. + kwargs: Any additional keys that will be passed to `elementwise_jacobian_transformation` function. - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'jacobian'` key in the `.uns`. This is a 3-dimensional tensor with + Returns: + AnnData object that is updated with the `'jacobian'` key in the `.uns`. This is a 3-dimensional tensor with dimensions n_effectors x n_regulators x n_obs. """ if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) if basis == "umap": cell_idx = np.arange(adata.n_obs) @@ -359,18 +335,18 @@ def jacobian( def hessian( - adata, - regulators, - coregulators, - effector, - cell_idx=None, - sampling=None, - sample_ncells=1000, - basis="pca", - Qkey="PCs", - vector_field_class=None, - method="analytical", - store_in_adata=True, + adata: AnnData, + regulators: List, + coregulators: List, + effector: Optional[List] = None, + cell_idx: Optional[List] = None, + sampling: Optional[Literal['random', 'velocity', 'trn']] = None, + sample_ncells: int = 1000, + basis: str = "pca", + Qkey: str = "PCs", + vector_field_class: Optional[scVectorField.BaseVectorField] = None, + method: str = "analytical", + store_in_adata: bool = True, **kwargs, ): """Calculate Hessian for each cell with the reconstructed vector field. @@ -380,69 +356,44 @@ def hessian( supported shortly. Note that we compute the Hessian for the RKHS kernel vector field analytically, which is much more computationally efficient than the numerical method. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field in `.uns`. - regulators: list - The list of genes that will be used as regulators when calculating the cell-wise Hessian matrix. The + Args: + adata: AnnData object that contains the reconstructed vector field in `.uns`. + regulators: The list of genes that will be used as regulators when calculating the cell-wise Hessian matrix. The Hessian is the matrix consisting of secondary partial derivatives of the vector field wrt gene expressions. It can be used to evaluate the change in velocities of effectors (see below) as the expressions of regulators and co-regulators increase. The regulators/co-regulators are the denominators of the partial derivatives. - coregulators: list - The list of genes that will be used as regulators when calculating the cell-wise Hessian matrix. The + coregulators: The list of genes that will be used as regulators when calculating the cell-wise Hessian matrix. The Hessian is the matrix consisting of secondary partial derivatives of the vector field wrt gene expressions. It can be used to evaluate the change in velocities of effectors (see below) as the expressions of regulators and co-regulators increase. The regulators/co-regulators are the denominators of the partial derivatives. - effector: list or None (default: None) - The target gene that will be used as effector when calculating the cell-wise Hessian matrix. Effector + effector: The target gene that will be used as effector when calculating the cell-wise Hessian matrix. Effector must be a single gene. The effector is the numerator of the partial derivatives. - cell_idx: list or None (default: None) - A list of cell index (or boolean flags) for which the Hessian is calculated. + cell_idx: A list of cell index (or boolean flags) for which the Hessian is calculated. If `None`, all or a subset of sampled cells are used. sampling: {None, 'random', 'velocity', 'trn'}, (default: None) See specific information on these methods in `.tl.sample`. If `None`, all cells are used. - sample_ncells: int (default: 1000) - The number of cells to be sampled. If `sampling` is None, this parameter is ignored. - basis: str (default: 'pca') - The embedding data in which the vector field was reconstructed. If `None`, use the vector field function + sample_ncells: The number of cells to be sampled. If `sampling` is None, this parameter is ignored. + basis: The embedding data in which the vector field was reconstructed. If `None`, use the vector field function that was reconstructed directly from the original unreduced gene expression space. - Qkey: str (default: 'PCs') - The key of the PCA loading matrix in `.uns`. - vector_field_class: :class:`~scVectorField.vectorfield` - If not `None`, the Hessian will be computed using this class instead of the vector field stored in adata. - method: str (default: 'analytical') - The method that will be used for calculating Hessian, either `'analytical'` or `'numerical'`. + Qkey: The key of the PCA loading matrix in `.uns`. + vector_field_class: If not `None`, the Hessian will be computed using this class instead of the vector field stored in adata. + method: The method that will be used for calculating Hessian, either `'analytical'` or `'numerical'`. `'analytical'` method uses the analytical expressions for calculating Hessian while `'numerical'` method uses numdifftools, a numerical differentiation tool, for computing Hessian. `'analytical'` method is much more efficient. - cores: int (default: 1) - Number of cores to calculate Hessian. Currently note used. - kwargs: - Any additional keys that will be passed to elementwise_hessian_transformation function. + cores: Number of cores to calculate Hessian. Currently note used. + kwargs: Any additional keys that will be passed to elementwise_hessian_transformation function. - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'Hessian'` key in the `.uns`. This is a 4-dimensional tensor with + Returns: + AnnData object that is updated with the `'Hessian'` key in the `.uns`. This is a 4-dimensional tensor with dimensions 1 (n_effector) x n_regulators x n_coregulators x n_obs. """ if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) if basis == "umap": cell_idx = np.arange(adata.n_obs) @@ -547,12 +498,12 @@ def hessian( def laplacian( - adata, - hkey="hessian_pca", - basis="pca", - Qkey="PCs", - vector_field_class=None, - method="analytical", + adata: AnnData, + hkey: str = "hessian_pca", + basis: str = "pca", + Qkey: str = "PCs", + vector_field_class: Optional[scVectorField.BaseVectorField] = None, + method: str = "analytical", **kwargs, ): """Calculate Laplacian for each target gene in each cell with the reconstructed vector field. @@ -562,31 +513,20 @@ def laplacian( supported shortly. Note we compute the Lapalacian for the RKHS kernel vector field analytically, which is much more computationally efficient than the numerical method. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field in `.uns` and the `hkey` (the hessian matrix). - basis: str (default: 'pca') - The embedding data in which the vector field was reconstructed. If `None`, use the vector field function + Args: + adata: AnnData object that contains the reconstructed vector field in `.uns` and the `hkey` (the hessian matrix). + basis: The embedding data in which the vector field was reconstructed. If `None`, use the vector field function that was reconstructed directly from the original unreduced gene expression space. - Qkey: str (default: 'PCs') - The key of the PCA loading matrix in `.uns`. - vector_field_class: :class:`~scVectorField.vectorfield` - If not `None`, the Hessian will be computed using this class instead of the vector field stored in adata. - method: str (default: 'analytical') - The method that will be used for calculating Laplacian, either `'analytical'` or `'numerical'`. + Qkey: The key of the PCA loading matrix in `.uns`. + vector_field_class: If not `None`, the Hessian will be computed using this class instead of the vector field stored in adata. + method: The method that will be used for calculating Laplacian, either `'analytical'` or `'numerical'`. `'analytical'` method uses the analytical expressions for calculating Laplacian while `'numerical'` method uses numdifftools, a numerical differentiation tool, for computing Laplacian. `'analytical'` method is much more efficient. - cores: int (default: 1) - Number of cores to calculate Hessian. Currently note used. - kwargs: - Any additional keys that will be passed to elementwise_hessian_transformation function. + kwargs: Any additional keys that will be passed to elementwise_hessian_transformation function. - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'Laplacian'` key in the `.obs` and `obsm`. The first one is the + Returns: + AnnData object that is updated with the `'Laplacian'` key in the `.obs` and `obsm`. The first one is the norm of the Laplacian for all target genes in a cell while the second one is the vector of Laplacian for all target genes in each cell. """ @@ -601,17 +541,7 @@ def laplacian( H = adata.uns[hkey]["hessian"] if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) Laplacian_func = vector_field_class.get_Laplacian(method=method) Ls = Laplacian_func(H) @@ -643,20 +573,20 @@ def laplacian( def sensitivity( - adata, - regulators=None, - effectors=None, - cell_idx=None, - sampling=None, - sample_ncells=1000, - basis="pca", - Qkey="PCs", - vector_field_class=None, - method="analytical", - projection_method="from_jacobian", - store_in_adata=True, + adata: AnnData, + regulators: Optional[List] = None, + effectors: Optional[List] = None, + cell_idx: Optional[List] = None, + sampling: Optional[Literal['random', 'velocity', 'trn']] = None, + sample_ncells: int = 1000, + basis: str = "pca", + Qkey: str = "PCs", + vector_field_class: Optional[scVectorField.BaseVectorField] = None, + method: str = "analytical", + projection_method: str = "from_jacobian", + store_in_adata: bool = True, **kwargs, -): +) -> Union[AnnData, Dict]: """Calculate Sensitivity matrix for each cell with the reconstructed vector field. If the vector field was reconstructed from the reduced PCA space, the Sensitivity matrix will then be inverse @@ -664,56 +594,41 @@ def sensitivity( supported shortly. Note that we compute the Sensitivity for the RKHS kernel vector field analytically, which is much more computationally efficient than the numerical method. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field in `.uns`. - regulators: list - The list of genes that will be used as regulators when calculating the cell-wise Jacobian matrix. The + Args: + adata: AnnData object that contains the reconstructed vector field in `.uns`. + regulators: The list of genes that will be used as regulators when calculating the cell-wise Jacobian matrix. The Jacobian is the matrix consisting of partial derivatives of the vector field wrt gene expressions. It can be used to evaluate the change in velocities of effectors (see below) as the expressions of regulators increase. The regulators are the denominators of the partial derivatives. - effectors: list or None (default: None) - The list of genes that will be used as effectors when calculating the cell-wise Jacobian matrix. The + effectors: The list of genes that will be used as effectors when calculating the cell-wise Jacobian matrix. The effectors are the numerators of the partial derivatives. - cell_idx: list or None (default: None) - A list of cell index (or boolean flags) for which the jacobian is calculated. + cell_idx: A list of cell index (or boolean flags) for which the jacobian is calculated. If `None`, all or a subset of sampled cells are used. sampling: {None, 'random', 'velocity', 'trn'}, (default: None) See specific information on these methods in `.tl.sample`. If `None`, all cells are used. - sample_ncells: int (default: 1000) - The number of cells to be sampled. If `sampling` is None, this parameter is ignored. - basis: str (default: 'pca') - The embedding data in which the vector field was reconstructed. If `None`, use the vector field function + sample_ncells: The number of cells to be sampled. If `sampling` is None, this parameter is ignored. + basis: The embedding data in which the vector field was reconstructed. If `None`, use the vector field function that was reconstructed directly from the original unreduced gene expression space. - Qkey: str (default: 'PCs') - The key of the PCA loading matrix in `.uns`. - vector_field_class: :class:`~scVectorField.vectorfield` - If not `None`, the jacobian will be computed using this class instead of the vector field stored in adata. - method: str (default: 'analytical') - The method that will be used for calculating Jacobian, either `'analytical'` or `'numerical'`. + Qkey: The key of the PCA loading matrix in `.uns`. + vector_field_class: If not `None`, the jacobian will be computed using this class instead of the vector field stored in adata. + method: The method that will be used for calculating Jacobian, either `'analytical'` or `'numerical'`. `'analytical'` method uses the analytical expressions for calculating Jacobian while `'numerical'` method uses numdifftools, a numerical differentiation tool, for computing Jacobian. `'analytical'` method is much more efficient. - projection_method: str (default: 'from_jacobian') - The method that will be used to project back to original gene expression space for calculating gene-wise + projection_method: The method that will be used to project back to original gene expression space for calculating gene-wise sensitivity matrix: (1) 'from_jacobian': first calculate jacobian matrix and then calculate sensitivity matrix. This method will take the combined regulator + effectors gene set for calculating a square Jacobian matrix - required for the sensitivyt matrix calculation. + required for the sensitivity matrix calculation. (2) 'direct': The sensitivity matrix on low dimension will first calculated and then projected back to original gene expression space in a way that is similar to the gene-wise jacobian calculation. - cores: int (default: 1) - Number of cores to calculate Jacobian. If cores is set to be > 1, multiprocessing will be used to + cores: Number of cores to calculate Jacobian. If cores is set to be > 1, multiprocessing will be used to parallel the Jacobian calculation. - kwargs: - Any additional keys that will be passed to elementwise_jacobian_transformation function. + kwargs: Any additional keys that will be passed to elementwise_jacobian_transformation function. - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'sensitivity'` key in the `.uns`. This is a 3-dimensional tensor + Returns: + adata: AnnData object that is updated with the `'sensitivity'` key in the `.uns`. This is a 3-dimensional tensor with dimensions n_obs x n_regulators x n_effectors. """ @@ -722,17 +637,7 @@ def sensitivity( list(np.unique(effectors)) if effectors is not None else None, ) if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) if basis == "umap": cell_idx = np.arange(adata.n_obs) @@ -841,52 +746,29 @@ def sensitivity( def acceleration( - adata, - basis="umap", - vector_field_class=None, - Qkey="PCs", - method="analytical", + adata: AnnData, + basis: str = "umap", + vector_field_class: Optional[scVectorField.BaseVectorField] = None, + Qkey: str = "PCs", + method: str = "analytical", **kwargs, ): - """Calculate acceleration for each cell with the reconstructed vector field function. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field function in the `uns` attribute. - basis: `str` or None (default: `umap`) - The embedding data in which the vector field was reconstructed. - vector_field_class: :class:`~scVectorField.vectorfield` - If not None, the divergene will be computed using this class instead of the vector field stored in adata. - Qkey: str (default: 'PCs') - The key of the PCA loading matrix in `.uns`. - method: str (default: 'analytical') - The method that will be used for calculating acceleration field, either `'analytical'` or `'numerical'`. + """Calculate acceleration for each cell with the reconstructed vector field function. AnnData object is updated with the `'acceleration'` key in the `.obs` as well as .obsm. If basis is `pca`, acceleration matrix will be inverse transformed back to original high dimension space. + + Args: + adata: AnnData object that contains the reconstructed vector field function in the `uns` attribute. + basis: The embedding data in which the vector field was reconstructed. + vector_field_class: If not None, the divergene will be computed using this class instead of the vector field stored in adata. + Qkey: The key of the PCA loading matrix in `.uns`. + method: The method that will be used for calculating acceleration field, either `'analytical'` or `'numerical'`. `'analytical'` method uses the analytical expressions for calculating acceleration field while `'numerical'` method uses numdifftools, a numerical differentiation tool, for computing acceleration. `'analytical'` method is much more efficient. - kwargs: - Any additional keys that will be passed to vector_field_class.compute_acceleration function. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'acceleration'` key in the `.obs` as well as .obsm. If basis is - `pca`, acceleration matrix will be inverse transformed back to original high dimension space. + kwargs: Any additional keys that will be passed to vector_field_class.compute_acceleration function. """ if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) X, V = vector_field_class.get_data() @@ -921,54 +803,31 @@ def acceleration( def curvature( adata: AnnData, basis: str = "pca", - vector_field_class: scVectorField.BaseVectorField = None, + vector_field_class: Optional[scVectorField.BaseVectorField] = None, formula: int = 2, Qkey: str = "PCs", method: str = "analytical", **kwargs, ): - """Calculate curvature for each cell with the reconstructed vector field function. + """Calculate curvature for each cell with the reconstructed vector field function. AnnData object that is updated with the `curvature` key in the `.obs`. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field function in the `uns` attribute. - basis: - The embedding data in which the vector field was reconstructed. - vector_field_class: :class:`~scVectorField.vectorfield` - If not None, the divergene will be computed using this class instead of the vector field stored in adata. - formula: int (default: 2) - Which formula of curvature will be used, there are two formulas, so formula can be either `{1, 2}`. By + Args: + adata: AnnData object that contains the reconstructed vector field function in the `uns` attribute. + basis: The embedding data in which the vector field was reconstructed. + vector_field_class: If not None, the divergene will be computed using this class instead of the vector field stored in adata. + formula: Which formula of curvature will be used, there are two formulas, so formula can be either `{1, 2}`. By default it is 2 and returns both the curvature vectors and the norm of the curvature. The formula one only gives the norm of the curvature. - Qkey: str (default: 'PCs') - The key of the PCA loading matrix in `.uns`. - method: str (default: 'analytical') - The method that will be used for calculating curvature field, either `'analytical'` or `'numerical'`. + Qkey: The key of the PCA loading matrix in `.uns`. + method: The method that will be used for calculating curvature field, either `'analytical'` or `'numerical'`. `'analytical'` method uses the analytical expressions for calculating curvature while `'numerical'` method uses numdifftools, a numerical differentiation tool, for computing curvature. `'analytical'` method is much more efficient. - kwargs: - Any additional keys that will be passed to vector_field_class.compute_curvature function. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `curvature` key in the `.obs`. + kwargs: Any additional keys that will be passed to vector_field_class.compute_curvature function. """ if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) if formula not in [1, 2]: raise ValueError( @@ -999,11 +858,12 @@ def curvature( ) -def torsion(adata, basis="umap", vector_field_class=None, **kwargs): - """Calculate torsion for each cell with the reconstructed vector field function. +def torsion( + adata: AnnData, basis: str = "umap", vector_field_class: Optional[scVectorField.BaseVectorField] = None, **kwargs +): + """Calculate torsion for each cell with the reconstructed vector field function. AnnData object that is updated with the `torsion` key in the .obs. - Parameters - ---------- + Args: adata: :class:`~anndata.AnnData` AnnData object that contains the reconstructed vector field function in the `uns` attribute. basis: str or None (default: `umap`) @@ -1013,11 +873,6 @@ def torsion(adata, basis="umap", vector_field_class=None, **kwargs): kwargs: Any additional keys that will be passed to vector_field_class.compute_torsion function. - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `torsion` key in the .obs. - Examples -------- >>> adata = dyn.sample_data.hematopoiesis() @@ -1035,17 +890,7 @@ def torsion(adata, basis="umap", vector_field_class=None, **kwargs): """ if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) X, V = vector_field_class.get_data() torsion_mat = vector_field_class.compute_torsion(X=X, **kwargs) @@ -1057,44 +902,28 @@ def torsion(adata, basis="umap", vector_field_class=None, **kwargs): adata.uns[torsion_key] = torsion_mat -def curl(adata, basis="umap", vector_field_class=None, method="analytical", **kwargs): - """Calculate Curl for each cell with the reconstructed vector field function. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field function in the `uns` attribute. - basis: str or None (default: `umap`) - The embedding data in which the vector field was reconstructed. - vector_field_class: :class:`~.scVectorField.vectorfield` - If not None, the divergene will be computed using this class instead of the vector field stored in adata. - method: str (default: `analytical`) - The method that will be used for calculating curl, either `analytical` or `numeric`. `analytical` +def curl( + adata: AnnData, + basis: str = "umap", + vector_field_class: Optional[scVectorField.BaseVectorField] = None, + method: str = "analytical", + **kwargs, +): + """Calculate Curl for each cell with the reconstructed vector field function. AnnData object is updated with the `'curl'` information in the `. + obs`. When vector field has three dimension, adata.obs['curl'] (magnitude of curl) and adata.obsm['curl'] (curl vector) will be added; when vector field has two dimension, only adata.obs['curl'] (magnitude of curl) will be provided. + + Args: + adata: AnnData object that contains the reconstructed vector field function in the `uns` attribute. + basis: The embedding data in which the vector field was reconstructed. + vector_field_class: If not None, the divergene will be computed using this class instead of the vector field stored in adata. + method: The method that will be used for calculating curl, either `analytical` or `numeric`. `analytical` method will use the analytical form of the reconstructed vector field for calculating curl while `numeric` method will use numdifftools for calculation. `analytical` method is much more efficient. - kwargs: - Any additional keys that will be passed to vector_field_class.compute_curl function. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'curl'` information in the `.obs`. When vector field has three - dimension, adata.obs['curl'] (magnitude of curl) and adata.obsm['curl'] (curl vector) will be added; when - vector field has two dimension, only adata.obs['curl'] (magnitude of curl) will be provided. + kwargs: Any additional keys that will be passed to vector_field_class.compute_curl function. """ if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) X, V = vector_field_class.get_data() curl = vector_field_class.compute_curl(X=X, method=method, **kwargs) @@ -1109,53 +938,39 @@ def curl(adata, basis="umap", vector_field_class=None, method="analytical", **kw def divergence( - adata, - cell_idx=None, - sampling=None, - sample_ncells=1000, - basis="pca", + adata: AnnData, + cell_idx: Optional[List] = None, + sampling: Optional[Literal['random', 'velocity', 'trn']] = None, + sample_ncells: int = 1000, + basis: str = "pca", vector_field_class=None, - method="analytical", - store_in_adata=True, + method: str = "analytical", + store_in_adata: bool = True, **kwargs, -): - """Calculate divergence for each cell with the reconstructed vector field function. +) -> Optional[np.ndarray]: + """Calculate divergence for each cell with the reconstructed vector field function. Either AnnData object is updated with the `'divergence'` key in the `.obs` or the divergence is returned as a numpy array. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field function in the `uns` attribute. - basis: str or None (default: `umap`) - The embedding data in which the vector field was reconstructed. - vector_field_class: :class:`scVectorField.vectorfield` - If not None, the divergene will be computed using this class instead of the vector field stored in adata. - method: str (default: `analytical`) - The method that will be used for calculating divergence, either `analytical` or `numeric`. `analytical` + Args: + adata: AnnData object that contains the reconstructed vector field function in the `uns` attribute. + cell_idx: A list of cell index (or boolean flags) for which the jacobian is calculated. + sampling: {None, 'random', 'velocity', 'trn'}, (default: None) + See specific information on these methods in `.tl.sample`. + If `None`, all cells are used. + sample_ncells: The number of cells to be sampled. If `sampling` is None, this parameter is ignored. + basis: The embedding data in which the vector field was reconstructed. + vector_field_class: If not None, the divergene will be computed using this class instead of the vector field stored in adata. + method: The method that will be used for calculating divergence, either `analytical` or `numeric`. `analytical` method will use the analytical form of the reconstructed vector field for calculating divergence while `numeric` method will use numdifftools for calculation. `analytical` method is much more efficient. - store_in_adata: bool (default: `True`) - Whether to store the divergence result in adata. - kwargs: - Any additional keys that will be passed to vector_field_class.compute_divergence function. + store_in_adata: Whether to store the divergence result in adata. + kwargs: Any additional keys that will be passed to vector_field_class.compute_divergence function. - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'divergence'` key in the `.obs`. + Returns: + the divergence is returned as an np.ndarray if store_in_adata is False. """ if vector_field_class is None: - vf_dict = get_vf_dict(adata, basis=basis) - if "method" not in vf_dict.keys(): - vf_dict["method"] = "sparsevfc" - if vf_dict["method"].lower() == "sparsevfc": - vector_field_class = SvcVectorField() - vector_field_class.from_adata(adata, basis=basis) - elif vf_dict["method"].lower() == "dynode": - vf_dict["parameters"]["load_model_from_buffer"] = True - vector_field_class = vf_dict["dynode_object"] # dynode_vectorfield(**vf_dict["parameters"]) - else: - raise ValueError("current only support two methods, SparseVFC and dynode") + vector_field_class = get_vf_class(adata, basis=basis) if basis == "umap": cell_idx = np.arange(adata.n_obs) @@ -1190,883 +1005,3 @@ def divergence( adata.obs[div_key] = Div else: return div - - -def rank_genes( - adata, - arr_key, - groups=None, - genes=None, - abs=False, - normalize=False, - fcn_pool=lambda x: np.mean(x, axis=0), - dtype=None, - output_values=False, -): - """Rank gene's absolute, positive, negative vector field metrics by different cell groups. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the array to be sorted in `.var` or `.layer`. - arr_key: str or :class:`~numpy.ndarray` - The key of the to-be-ranked array stored in `.var` or or `.layer`. - If the array is found in `.var`, the `groups` argument will be ignored. - If a numpy array is passed, it is used as the array to be ranked and must - be either an 1d array of length `.n_var`, or a `.n_obs`-by-`.n_var` 2d array. - groups: str or None (default: None) - Cell groups used to group the array. - genes: list or None (default: None) - The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. - abs: bool (default: False) - When pooling the values in the array (see below), whether to take the absolute values. - normalize: bool (default: False) - Whether normalize the array across all cells first, if the array is 2d. - fcn_pool: callable (default: numpy.mean(x, axis=0)) - The function used to pool values in the to-be-ranked array if the array is 2d. - output_values: bool (default: False) - Whether output the values along with the rankings. - - Returns - ------- - ret_dict: dict - A dictionary of gene names and values based on which the genes are sorted for each cell group. - """ - - genes, arr = get_rank_array( - adata, - arr_key, - genes=genes, - abs=abs, - dtype=dtype, - ) - - if arr.ndim > 1: - if normalize: - arr_max = np.max(np.abs(arr), axis=0) - arr = arr / arr_max - arr[np.isnan(arr)] = 0 - if groups is not None: - if type(groups) is str and groups in adata.obs.keys(): - grps = np.array(adata.obs[groups]) - elif isarray(groups): - grps = np.array(groups) - else: - raise Exception(f"The group information {groups} you provided is not in your adata object.") - arr_dict = {} - for g in np.unique(grps): - arr_dict[g] = fcn_pool(arr[grps == g]) - else: - arr_dict = {"all": fcn_pool(arr)} - else: - arr_dict = {"all": arr} - - ret_dict = {} - var_names = np.array(index_gene(adata, adata.var_names, genes)) - for g, arr in arr_dict.items(): - if ismatrix(arr): - arr = arr.A.flatten() - glst, sarr = list_top_genes(arr, var_names, None, return_sorted_array=True) - # ret_dict[g] = {glst[i]: sarr[i] for i in range(len(glst))} - ret_dict[g] = glst - if output_values: - ret_dict[g + "_values"] = sarr - return pd.DataFrame(data=ret_dict) - - -def rank_cells( - adata, - arr_key, - groups=None, - genes=None, - abs=False, - fcn_pool=lambda x: np.mean(x, axis=0), - dtype=None, - output_values=False, -): - """Rank cell's absolute, positive, negative vector field metrics by different gene groups. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the array to be sorted in `.var` or `.layer`. - arr_key: str or :class:`~numpy.ndarray` - The key of the to-be-ranked array stored in `.var` or or `.layer`. - If the array is found in `.var`, the `groups` argument will be ignored. - If a numpy array is passed, it is used as the array to be ranked and must - be either an 1d array of length `.n_var`, or a `.n_obs`-by-`.n_var` 2d array. - groups: str or None (default: None) - Gene groups used to group the array. - genes: list or None (default: None) - The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. - abs: bool (default: False) - When pooling the values in the array (see below), whether to take the absolute values. - fcn_pool: callable (default: numpy.mean(x, axis=0)) - The function used to pool values in the to-be-ranked array if the array is 2d. - Returns - ------- - ret_dict: dict - A dictionary of cells names and values based on which the genes are sorted for each gene group. - """ - - genes, arr = get_rank_array( - adata, - arr_key, - genes=genes, - abs=abs, - dtype=dtype, - ) - arr = arr.T - - if arr.ndim > 1: - if groups is not None: - if type(groups) is str and groups in adata.var.keys(): - grps = np.array(adata.var[groups]) # check this - elif isarray(groups): - grps = np.array(groups) - else: - raise Exception(f"The group information {groups} you provided is not in your adata object.") - arr_dict = {} - for g in np.unique(grps): - arr_dict[g] = fcn_pool(arr[grps == g]) - else: - arr_dict = {"all": fcn_pool(arr)} - else: - arr_dict = {"all": arr} - - ret_dict = {} - cell_names = np.array(adata.obs_names) - for g, arr in arr_dict.items(): - if ismatrix(arr): - arr = arr.A.flatten() - glst, sarr = list_top_genes(arr, cell_names, None, return_sorted_array=True) - # ret_dict[g] = {glst[i]: sarr[i] for i in range(len(glst))} - ret_dict[g] = glst - if output_values: - ret_dict[g + "_values"] = sarr - return pd.DataFrame(data=ret_dict) - - -def rank_cell_groups( - adata, - arr_key, - groups=None, - genes=None, - abs=False, - fcn_pool=lambda x: np.mean(x, axis=0), - dtype=None, - output_values=False, -): - """Rank cell's absolute, positive, negative vector field metrics by different gene groups. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the array to be sorted in `.var` or `.layer`. - arr_key: str or :class:`~numpy.ndarray` - The key of the to-be-ranked array stored in `.var` or or `.layer`. - If the array is found in `.var`, the `groups` argument will be ignored. - If a numpy array is passed, it is used as the array to be ranked and must - be either an 1d array of length `.n_var`, or a `.n_obs`-by-`.n_var` 2d array. - groups: str or None (default: None) - Gene groups used to group the array. - genes: list or None (default: None) - The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. - abs: bool (default: False) - When pooling the values in the array (see below), whether to take the absolute values. - fcn_pool: callable (default: numpy.mean(x, axis=0)) - The function used to pool values in the to-be-ranked array if the array is 2d. - Returns - ------- - ret_dict: dict - A dictionary of cells names and values based on which the genes are sorted for each gene group. - """ - - genes, arr = get_rank_array( - adata, - arr_key, - genes=genes, - abs=abs, - dtype=dtype, - ) - arr = arr.T - - if arr.ndim > 1: - if groups is not None: - if type(groups) is str and groups in adata.var.keys(): - grps = np.array(adata.var[groups]) # check this - elif isarray(groups): - grps = np.array(groups) - else: - raise Exception(f"The group information {groups} you provided is not in your adata object.") - arr_dict = {} - for g in np.unique(grps): - arr_dict[g] = fcn_pool(arr[grps == g]) - else: - arr_dict = {"all": fcn_pool(arr)} - else: - arr_dict = {"all": arr} - - ret_dict = {} - cell_names = np.array(adata.obs_names) - for g, arr in arr_dict.items(): - if ismatrix(arr): - arr = arr.A.flatten() - glst, sarr = list_top_genes(arr, cell_names, None, return_sorted_array=True) - # ret_dict[g] = {glst[i]: sarr[i] for i in range(len(glst))} - ret_dict[g] = glst - if output_values: - ret_dict[g + "_values"] = sarr - return pd.DataFrame(data=ret_dict) - - -def rank_expression_genes(adata, ekey="M_s", prefix_store="rank", **kwargs): - """Rank genes based on their expression values for each cell group. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the normalized or locally smoothed expression. - ekey: str (default: 'M_s') - The expression key, can be any properly normalized layers, e.g. M_s, M_u, M_t, M_n. - prefix_store: str (default: 'rank') - The prefix added to the key for storing the returned in adata. - kwargs: - additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: - group: str or None (default: None) - The cell group that speed ranking will be grouped-by. - genes: list or None (default: None) - The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. - abs: bool (default: False) - When pooling the values in the array (see below), whether to take the absolute values. - normalize: bool (default: False) - Whether normalize the array across all cells first, if the array is 2d. - fcn_pool: callable (default: numpy.mean(x, axis=0)) - The function used to pool values in the to-be-ranked array if the array is 2d. - output_values: bool (default: False) - Whether output the values along with the rankings. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object which has the rank dictionary for expression in `.uns`. - """ - - rdict = rank_genes(adata, ekey, **kwargs) - adata.uns[prefix_store + "_" + ekey] = rdict - return adata - - -def rank_velocity_genes(adata, vkey="velocity_S", prefix_store="rank", **kwargs): - """Rank genes based on their raw and absolute velocities for each cell group. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the gene-wise velocities. - vkey: str (default: 'velocity_S') - The velocity key. - prefix_store: str (default: 'rank') - The prefix added to the key for storing the returned in adata. - kwargs: - additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: - group: str or None (default: None) - The cell group that speed ranking will be grouped-by. - genes: list or None (default: None) - The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. - abs: bool (default: False) - When pooling the values in the array (see below), whether to take the absolute values. - normalize: bool (default: False) - Whether normalize the array across all cells first, if the array is 2d. - fcn_pool: callable (default: numpy.mean(x, axis=0)) - The function used to pool values in the to-be-ranked array if the array is 2d. - output_values: bool (default: False) - Whether output the values along with the rankings. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object which has the rank dictionary for velocities in `.uns`. - """ - rdict = rank_genes(adata, vkey, **kwargs) - rdict_abs = rank_genes(adata, vkey, abs=True, **kwargs) - adata.uns[prefix_store + "_" + vkey] = rdict - adata.uns[prefix_store + "_abs_" + vkey] = rdict_abs - return adata - - -def rank_divergence_genes( - adata, - jkey="jacobian_pca", - genes=None, - prefix_store="rank_div_gene", - **kwargs, -): - """Rank genes based on their diagonal Jacobian for each cell group. - Be aware that this 'divergence' refers to the diagonal elements of a gene-wise - Jacobian, rather than its trace, which is the common definition of the divergence. - - Run .vf.jacobian and set store_in_adata=True before using this function. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field in the `.uns` attribute. - jkey: str (default: 'jacobian_pca') - The key in .uns of the cell-wise Jacobian matrix. - genes: list or None (default: None) - A list of names for genes of interest. - prefix_store: str (default: 'rank') - The prefix added to the key for storing the returned ranking info in adata. - kwargs: - additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: - group: str or None (default: None) - The cell group that speed ranking will be grouped-by. - genes: list or None (default: None) - The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. - abs: bool (default: False) - When pooling the values in the array (see below), whether to take the absolute values. - normalize: bool (default: False) - Whether normalize the array across all cells first, if the array is 2d. - fcn_pool: callable (default: numpy.mean(x, axis=0)) - The function used to pool values in the to-be-ranked array if the array is 2d. - output_values: bool (default: False) - Whether output the values along with the rankings. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object which has the rank dictionary for diagonal jacobians in `.uns`. - """ - - if jkey not in adata.uns_keys(): - raise Exception(f"The provided dictionary key {jkey} is not in .uns.") - - reg = [x for x in adata.uns[jkey]["regulators"]] - eff = [x for x in adata.uns[jkey]["effectors"]] - if reg != eff: - raise Exception("The Jacobian should have the same regulators and effectors.") - else: - Genes = adata.uns[jkey]["regulators"] - cell_idx = adata.uns[jkey]["cell_idx"] - div = np.einsum("iij->ji", adata.uns[jkey]["jacobian_gene"]) - Div = create_layer(adata, div, genes=Genes, cells=cell_idx, dtype=np.float32) - - if genes is not None: - Genes = list(set(Genes).intersection(genes)) - - rdict = rank_genes( - adata, - Div, - fcn_pool=lambda x: np.nanmean(x, axis=0), - genes=Genes, - **kwargs, - ) - adata.uns[prefix_store + "_" + jkey] = rdict - return rdict - - -def rank_s_divergence_genes( - adata, - skey="sensitivity_pca", - genes=None, - prefix_store="rank_s_div_gene", - **kwargs, -): - """Rank genes based on their diagonal Sensitivity for each cell group. - Be aware that this 'divergence' refers to the diagonal elements of a gene-wise - Sensitivity, rather than its trace, which is the common definition of the divergence. - - Run .vf.sensitivity and set store_in_adata=True before using this function. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field in the `.uns` attribute. - skey: str (default: 'sensitivity_pca') - The key in .uns of the cell-wise sensitivity matrix. - genes: list or None (default: None) - A list of names for genes of interest. - prefix_store: str (default: 'rank') - The prefix added to the key for storing the returned ranking info in adata. - kwargs: - additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: - group: str or None (default: None) - The cell group that speed ranking will be grouped-by. - genes: list or None (default: None) - The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. - abs: bool (default: False) - When pooling the values in the array (see below), whether to take the absolute values. - normalize: bool (default: False) - Whether normalize the array across all cells first, if the array is 2d. - fcn_pool: callable (default: numpy.mean(x, axis=0)) - The function used to pool values in the to-be-ranked array if the array is 2d. - output_values: bool (default: False) - Whether output the values along with the rankings. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object which has the rank dictionary for diagonal sensitivity in `.uns`. - """ - - if skey not in adata.uns_keys(): - raise Exception(f"The provided dictionary key {skey} is not in .uns.") - - reg = [x for x in adata.uns[skey]["regulators"]] - eff = [x for x in adata.uns[skey]["effectors"]] - if reg != eff: - raise Exception("The Jacobian should have the same regulators and effectors.") - else: - Genes = adata.uns[skey]["regulators"] - cell_idx = adata.uns[skey]["cell_idx"] - div = np.einsum("iij->ji", adata.uns[skey]["sensitivity_gene"]) - Div = create_layer(adata, div, genes=Genes, cells=cell_idx, dtype=np.float32) - - if genes is not None: - Genes = list(set(Genes).intersection(genes)) - - rdict = rank_genes( - adata, - Div, - fcn_pool=lambda x: np.nanmean(x, axis=0), - genes=Genes, - **kwargs, - ) - adata.uns[prefix_store + "_" + skey] = rdict - return rdict - - -def rank_acceleration_genes(adata, akey="acceleration", prefix_store="rank", **kwargs): - """Rank genes based on their absolute, positive, negative accelerations for each cell group. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field function in the `uns` attribute. - akey: str (default: 'acceleration') - The acceleration key. - prefix_store: str (default: "rank") - The prefix of the key that will be used to store the acceleration rank result. - kwargs: - additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: - group: str or None (default: None) - The cell group that speed ranking will be grouped-by. - genes: list or None (default: None) - The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. - abs: bool (default: False) - When pooling the values in the array (see below), whether to take the absolute values. - normalize: bool (default: False) - Whether normalize the array across all cells first, if the array is 2d. - fcn_pool: callable (default: numpy.mean(x, axis=0)) - The function used to pool values in the to-be-ranked array if the array is 2d. - output_values: bool (default: False) - Whether output the values along with the rankings. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'rank_acceleration'` information in the `.uns`. - """ - - rdict = rank_genes(adata, akey, **kwargs) - rdict_abs = rank_genes(adata, akey, abs=True, **kwargs) - adata.uns[prefix_store + "_" + akey] = rdict - adata.uns[prefix_store + "_abs_" + akey] = rdict_abs - return adata - - -def rank_curvature_genes(adata, ckey="curvature", prefix_store="rank", **kwargs): - """Rank gene's absolute, positive, negative curvature by different cell groups. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field function in the `.uns` attribute. - ckey: `str` (default: `curvature`) - The curvature key. - prefix_store: str (default: "rank") - The prefix of the key that will be used to store the acceleration rank result. - kwargs: - additional keys that will be passed to the `rank_genes` function. It will accept the following arguments: - group: str or None (default: None) - The cell group that speed ranking will be grouped-by. - genes: list or None (default: None) - The gene list that speed will be ranked. If provided, they must overlap the dynamics genes. - abs: bool (default: False) - When pooling the values in the array (see below), whether to take the absolute values. - normalize: bool (default: False) - Whether normalize the array across all cells first, if the array is 2d. - fcn_pool: callable (default: numpy.mean(x, axis=0)) - The function used to pool values in the to-be-ranked array if the array is 2d. - output_values: bool (default: False) - Whether output the values along with the rankings. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object that is updated with the `'rank_curvature'` related information in the .uns. - """ - rdict = rank_genes(adata, ckey, **kwargs) - rdict_abs = rank_genes(adata, ckey, abs=True, **kwargs) - adata.uns[prefix_store + "_" + ckey] = rdict - adata.uns[prefix_store + "_abs_" + ckey] = rdict_abs - return adata - - -def rank_jacobian_genes( - adata, - groups=None, - jkey="jacobian_pca", - abs=False, - mode="full reg", - exclude_diagonal=False, - normalize=False, - return_df=False, - **kwargs, -): - """Rank genes or gene-gene interactions based on their Jacobian elements for each cell group. - - Run .vf.jacobian and set store_in_adata=True before using this function. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field in the `.uns` attribute. - groups: str or None (default: None) - Cell groups used to group the Jacobians. - jkey: str (default: 'jacobian_pca') - The key of the stored Jacobians in `.uns`. - abs: bool (default: False) - Whether take the absolute value of the Jacobian. - mode: {'full reg', 'full eff', 'reg', 'eff', 'int', 'switch'} (default: 'full_reg') - The mode of ranking: - (1) `'full reg'`: top regulators are ranked for each effector for each cell group; - (2) `'full eff'`: top effectors are ranked for each regulator for each cell group; - (3) '`reg`': top regulators in each cell group; - (4) '`eff`': top effectors in each cell group; - (5) '`int`': top effector-regulator pairs in each cell group. - (6) '`switch`': top effector-regulator pairs that show mutual inhibition pattern in each cell group. - exclude_diagonal: bool (default: False) - Whether to consider the self-regulation interactions (diagnoal of the jacobian matrix) - normalize: bool (default: False) - Whether normalize the Jacobian across all cells before performing the ranking. - return_df: bool (default: False) - Whether to return the data or to save results in adata object via the key `mode` of adata.uns. - kwargs: - Keyword arguments passed to ranking functions. - - Returns - ------- - rank_info: - different modes return different types of return values - 1. full reg and full eff: - A pandas dataframe containing ranking info based on Jacobian elements - 2. reg eff int: - A dictionary object whose keys correspond to groups, and whose values are - specific rank's pd dataframe - """ - J_dict = adata.uns[jkey] - J = J_dict["jacobian_gene"] - if abs: - J = np.abs(J) - - if normalize: - Jmax = np.max(np.abs(J), axis=2) - for i in range(J.shape[2]): - J[:, :, i] /= Jmax - - if mode == "switch": - J_transpose = J.transpose(1, 0, 2) - J_mul = J * J_transpose - # switch genes will have negative Jacobian between any two gene pairs - # only True * True = 1, so only the gene pair with both negative Jacobian, this will be non-zero: - J = J_mul * (np.sign(J) == -1) * (np.sign(J_transpose) == -1) - - if groups is None: - J_mean = {"all": np.mean(J, axis=2)} - else: - if type(groups) is str and groups in adata.obs.keys(): - grps = np.array(adata.obs[groups]) - elif isarray(groups): - grps = np.array(groups) - else: - raise Exception(f"The group information {groups} you provided is not in your adata object.") - J_mean = average_jacobian_by_group(J, grps[J_dict["cell_idx"]]) - - eff = np.array([x for x in J_dict["effectors"]]) - reg = np.array([x for x in J_dict["regulators"]]) - rank_dict = {} - ov = kwargs.pop("output_values", True) - if mode in ["full reg", "full_reg"]: - for k, J in J_mean.items(): - rank_dict[k] = table_top_genes(J, eff, reg, n_top_genes=None, output_values=ov, **kwargs) - elif mode in ["full eff", "full_eff"]: - for k, J in J_mean.items(): - rank_dict[k] = table_top_genes(J.T, reg, eff, n_top_genes=None, output_values=ov, **kwargs) - elif mode == "reg": - for k, J in J_mean.items(): - if exclude_diagonal: - for i, ef in enumerate(eff): - ii = np.where(reg == ef)[0] - if len(ii) > 0: - J[i, ii] = np.nan - j = np.nanmean(J, axis=0) - if ov: - rank_dict[k], rank_dict[k + "_values"] = list_top_genes( - j, reg, None, return_sorted_array=True, **kwargs - ) - else: - rank_dict[k] = list_top_genes(j, reg, None, **kwargs) - rank_dict = pd.DataFrame(data=rank_dict) - elif mode == "eff": - for k, J in J_mean.items(): - if exclude_diagonal: - for i, re in enumerate(reg): - ii = np.where(eff == re)[0] - if len(ii) > 0: - J[ii, i] = np.nan - j = np.nanmean(J, axis=1) - if ov: - rank_dict[k], rank_dict[k + "_values"] = list_top_genes( - j, eff, None, return_sorted_array=True, **kwargs - ) - else: - rank_dict[k] = list_top_genes(j, eff, None, **kwargs) - rank_dict = pd.DataFrame(data=rank_dict) - elif mode in ["int", "switch"]: - for k, J in J_mean.items(): - ints, vals = list_top_interactions(J, eff, reg, **kwargs) - rank_dict[k] = [] - if ov: - rank_dict[k + "_values"] = [] - for ind, int_val in enumerate(ints): - if not (exclude_diagonal and int_val[0] == int_val[1]): - rank_dict[k].append(int_val[0] + " - " + int_val[1]) - if ov: - rank_dict[k + "_values"].append(vals[ind]) - rank_dict = pd.DataFrame(data=rank_dict) - else: - raise ValueError(f"No such mode as {mode}.") - - if return_df: - return rank_dict - else: - main_info_insert_adata_uns(mode) - adata.uns[mode] = rank_dict - - -def rank_sensitivity_genes( - adata, - groups=None, - skey="sensitivity_pca", - abs=False, - mode="full reg", - exclude_diagonal=False, - **kwargs, -): - """Rank genes or gene-gene interactions based on their sensitivity elements for each cell group. - - Run .vf.sensitivity and set store_in_adata=True before using this function. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field in the `.uns` attribute. - groups: str or None (default: None) - Cell groups used to group the sensitivity. - skey: str (default: 'sensitivity_pca') - The key of the stored sensitivity in `.uns`. - abs: bool (default: False) - Whether or not to take the absolute value of the Jacobian. - mode: {'full reg', 'full eff', 'reg', 'eff', 'int'} (default: 'full_reg') - The mode of ranking: - (1) `'full reg'`: top regulators are ranked for each effector for each cell group; - (2) `'full eff'`: top effectors are ranked for each regulator for each cell group; - (3) '`reg`': top regulators in each cell group; - (4) '`eff`': top effectors in each cell group; - (5) '`int`': top effector-regulator pairs in each cell group. - exclude_diagonal: bool (default: False) - Whether to consider the self-regulation interactions (diagnoal of the jacobian matrix) - kwargs: - Keyword arguments passed to ranking functions. - - Returns - ------- - adata: :class:`~anndata.AnnData` - AnnData object which has the rank dictionary in `.uns`. - """ - S_dict = adata.uns[skey] - S = S_dict["sensitivity_gene"] - if abs: - S = np.abs(S) - if groups is None: - S_mean = {"all": np.mean(S, axis=2)} - else: - if type(groups) is str and groups in adata.obs.keys(): - grps = np.array(adata.obs[groups]) - elif isarray(groups): - grps = np.array(groups) - else: - raise Exception(f"The group information {groups} you provided is not in your adata object.") - S_mean = average_jacobian_by_group(S, grps[S_dict["cell_idx"]]) - - eff = np.array([x for x in S_dict["effectors"]]) - reg = np.array([x for x in S_dict["regulators"]]) - rank_dict = {} - if mode in ["full reg", "full_reg"]: - for k, S in S_mean.items(): - rank_dict[k] = table_top_genes(S, eff, reg, n_top_genes=None, **kwargs) - elif mode in ["full eff", "full_eff"]: - for k, S in S_mean.items(): - rank_dict[k] = table_top_genes(S.T, reg, eff, n_top_genes=None, **kwargs) - elif mode == "reg": - ov = kwargs.pop("output_values", False) - for k, S in S_mean.items(): - if exclude_diagonal: - for i, ef in enumerate(eff): - ii = np.where(reg == ef)[0] - if len(ii) > 0: - S[i, ii] = np.nan - j = np.nanmean(S, axis=0) - if ov: - rank_dict[k], rank_dict[k + "_values"] = list_top_genes( - j, reg, None, return_sorted_array=True, **kwargs - ) - else: - rank_dict[k] = list_top_genes(j, reg, None, **kwargs) - rank_dict = pd.DataFrame(data=rank_dict) - elif mode == "eff": - ov = kwargs.pop("output_values", False) - for k, S in S_mean.items(): - if exclude_diagonal: - for i, re in enumerate(reg): - ii = np.where(eff == re)[0] - if len(ii) > 0: - S[ii, i] = np.nan - j = np.nanmean(S, axis=1) - if ov: - rank_dict[k], rank_dict[k + "_values"] = list_top_genes( - j, eff, None, return_sorted_array=True, **kwargs - ) - else: - rank_dict[k] = list_top_genes(j, eff, None, **kwargs) - rank_dict = pd.DataFrame(data=rank_dict) - elif mode == "int": - ov = kwargs.pop("output_values", False) - for k, S in S_mean.items(): - ints, vals = list_top_interactions(S, eff, reg, **kwargs) - rank_dict[k] = [] - if ov: - rank_dict[k + "_values"] = [] - for ind, int_val in enumerate(ints): - if not (exclude_diagonal and int_val[0] == int_val[1]): - rank_dict[k].append(int_val[0] + " - " + int_val[1]) - if ov: - rank_dict[k + "_values"].append(vals[ind]) - rank_dict = pd.DataFrame(data=rank_dict) - else: - raise ValueError(f"No such mode as {mode}.") - return rank_dict - - -# --------------------------------------------------------------------------------------------------- -# aggregate regulators or targets -def aggregateRegEffs( - adata, - data_dict=None, - reg_dict=None, - eff_dict=None, - key="jacobian", - basis="pca", - store_in_adata=True, -): - """Aggregate multiple genes' Jacobian or sensitivity. - - Parameters - ---------- - adata: :class:`~anndata.AnnData` - AnnData object that contains the reconstructed vector field in `.uns`. - data_dict: `dict` - A dictionary corresponds to the Jacobian or sensitivity information, must be calculated with either: - `dyn.vf.jacobian(adata, basis='pca', regulators=genes, effectors=genes)` or - `dyn.vf.sensitivity(adata, basis='pca', regulators=genes, effectors=genes)` - reg_dict: `dict` - A dictionary in which keys correspond to regulator-groups (i.e. TFs for specific cell type) while values - a list of genes that must have at least one overlapped genes with that from the Jacobian or sensitivity - dict. - eff_dict: `dict` - A dictionary in which keys correspond to effector-groups (i.e. markers for specific cell type) while values - a list of genes that must have at least one overlapped genes with that from the Jacobian or sensitivity - dict. - key: `str` - The key in .uns that corresponds to the Jacobian or sensitivity matrix information. - basis: str (default: 'pca') - The embedding data in which the vector field was reconstructed. If `None`, use the vector field function - that was reconstructed directly from the original unreduced gene expression space. - store_in_adata: bool (default: `True`) - Whether to store the divergence result in adata. - - - Returns - ------- - adata: :class:`~anndata.AnnData` - Depending on `store_in_adata`, it will either return a dictionary that include the aggregated Jacobian or - sensitivity information or the updated AnnData object that is updated with the `'aggregation'` key in the - `.uns`. This dictionary contains a 3-dimensional tensor with dimensions n_obs x n_regulators x n_effectors - as well as other information. - """ - - key_ = key if basis is None else key + "_" + basis - data_dict = adata.uns[key_] if data_dict is None else data_dict - - tensor, cell_idx, tensor_gene, regulators_, effectors_ = ( - data_dict.get(key), - data_dict.get("cell_idx"), - data_dict.get(key + "_gene"), - data_dict.get("regulators"), - data_dict.get("effectors"), - ) - - Aggregation = np.zeros((len(eff_dict), len(reg_dict), len(cell_idx))) - reg_ind = 0 - for reg_key, reg_val in reg_dict.items(): - eff_ind = 0 - for eff_key, eff_val in eff_dict.items(): - reg_val, eff_val = ( - list(np.unique(reg_val)) if reg_val is not None else None, - list(np.unique(eff_val)) if eff_val is not None else None, - ) - - Der, source_genes, target_genes = intersect_sources_targets( - reg_val, - regulators_, - eff_val, - effectors_, - tensor if tensor_gene is None else tensor_gene, - ) - if len(source_genes) + len(target_genes) > 0: - Aggregation[eff_ind, reg_ind, :] = Der.sum(axis=(0, 1)) # dim 0: target; dim 1: source - else: - Aggregation[eff_ind, reg_ind, :] = np.nan - eff_ind += 1 - reg_ind += 0 - - ret_dict = {"aggregation": None, "cell_idx": cell_idx} - # use 'str_key' in dict.keys() to check if these items are computed, or use dict.get('str_key') - if Aggregation is not None: - ret_dict["aggregation_gene"] = Aggregation - if reg_dict.keys() is not None: - ret_dict["regulators"] = list(reg_dict.keys()) - if eff_dict.keys() is not None: - ret_dict["effectors"] = list(eff_dict.keys()) - - det = [np.linalg.det(Aggregation[:, :, i]) for i in np.arange(Aggregation.shape[2])] - key = key + "_aggregation" if basis is None else key + "_aggregation_" + basis - adata.obs[key + "_det"] = np.nan - adata.obs[key + "_det"][cell_idx] = det - if store_in_adata: - adata.uns[key] = ret_dict - return adata - else: - return ret_dict