Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fano #474

Merged
merged 34 commits into from
May 4, 2023
Merged

Fano #474

Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
ee5a45f
Fano
Ukyeon Mar 28, 2023
36de289
use defautl kernel
Ukyeon Mar 29, 2023
a55534b
call direct select_genes_monocle
Ukyeon Apr 1, 2023
dc00cf8
gene selection refactoring
Ukyeon Apr 4, 2023
2d1c727
Fix to gini function a lot faster
Ukyeon Apr 5, 2023
a4ffc8c
fixed a bug in gini and added docstring
Ukyeon Apr 5, 2023
b4f1ca0
Merge branch 'master' into fano
Ukyeon Apr 5, 2023
a4dc648
fix function name get_mean_cv and added test
Ukyeon Apr 6, 2023
7f837a2
docstring
Ukyeon Apr 6, 2023
dfb5866
docstring
Ukyeon Apr 6, 2023
0b31589
fetch the lastest code
Ukyeon Apr 6, 2023
0e08424
add an warning msg for the dispersion mode
Ukyeon Apr 8, 2023
99c3b9a
Merge remote-tracking branch 'refs/remotes/origin/fano' into fano
Ukyeon Apr 8, 2023
89089c6
deprecated functions
LoveLennone Apr 9, 2023
7e659cd
fix multiprocess in sctransform
Ukyeon Apr 12, 2023
c70a450
Merge remote-tracking branch 'refs/remotes/origin/fano' into fano
Ukyeon Apr 12, 2023
7a5c6e2
delete duplicated filtering cells operation
Ukyeon Apr 13, 2023
9272bc8
Add checking of tkey and cell_cycle_score
Ukyeon Apr 14, 2023
e64004b
fix mode in dyn.pl.feature_genes
Ukyeon Apr 14, 2023
dc4c617
Organize logs
Ukyeon Apr 14, 2023
dfcc62d
fix logging messages
LoveLennone Apr 16, 2023
1b92ccf
Merge branch 'master' into fano
Ukyeon Apr 18, 2023
472ef0c
deprecate top_table
Ukyeon Apr 21, 2023
0e6b0c8
Fix issues and reorder the normalization steps
Ukyeon Apr 21, 2023
8fd56dc
Revert back to the previous functions and fix issues
Ukyeon Apr 24, 2023
665f3b7
Refactoring normalize and log1p function
Ukyeon Apr 25, 2023
0cfdfe7
fix for updating value in log functions and naming _Freeman_Tukey
Ukyeon Apr 25, 2023
c36fe03
fix to save the results of fano and to basic filtering for all recipes
Ukyeon Apr 26, 2023
839969f
modularize Gini and dispersion
Sichao25 Apr 28, 2023
3558bda
update sctransform gene selection
Sichao25 Apr 28, 2023
e6e0aae
update helper function usage when getting gini filter
Sichao25 Apr 28, 2023
c21fcf1
increase subset size
Sichao25 May 1, 2023
d2438d3
Merge branch 'master' into fano
Ukyeon May 4, 2023
b37ef81
fix build error in test_normalize
Ukyeon May 4, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dynamo/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from cycler import cycler
from matplotlib import cm, colors, rcParams

from .dynamo_logger import main_info, main_warning
from .dynamo_logger import main_debug, main_info


class DynamoAdataKeyManager:
Expand Down Expand Up @@ -786,5 +786,5 @@ def set_pub_style_mpltex():

# initialize DynamoSaveConfig and DynamoVisConfig mode defaults
DynamoAdataConfig.update_data_store_mode("full")
main_info("setting visualization default mode in dynamo. Your customized matplotlib settings might be overritten.")
main_debug("setting visualization default mode in dynamo. Your customized matplotlib settings might be overwritten.")
DynamoVisConfig.set_default_mode()
12 changes: 6 additions & 6 deletions dynamo/dynamo_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,8 @@ def error(self, message, indent_level=1, *args, **kwargs):

def info_insert_adata(self, key, adata_attr="obsm", indent_level=1, *args, **kwargs):
message = "<insert> %s to %s in AnnData Object." % (key, adata_attr)
message = format_logging_message(message, logging.INFO, indent_level=indent_level)
return self.logger.error(message, *args, **kwargs)
message = format_logging_message(message, logging.DEBUG, indent_level=indent_level)
return self.logger.debug(message, *args, **kwargs)

def info_insert_adata_var(self, key, indent_level=1, *args, **kwargs):
return self.info_insert_adata(self, key, adata_attr="var", indent_level=1, *args, **kwargs)
Expand Down Expand Up @@ -189,18 +189,18 @@ def report_progress(self, percent=None, count=None, total=None, progress_name=""

def finish_progress(self, progress_name="", time_unit="s", indent_level=1):
self.log_time()
self.report_progress(percent=100, progress_name=progress_name)
#self.report_progress(percent=100, progress_name=progress_name)

saved_terminator = self.logger_stream_handler.terminator
self.logger_stream_handler.terminator = ""
self.logger.info("\n")
#self.logger.info("\n")
Ukyeon marked this conversation as resolved.
Show resolved Hide resolved
self.logger_stream_handler.flush()
self.logger_stream_handler.terminator = saved_terminator

if time_unit == "s":
self.info("[%s] finished [%.4fs]" % (progress_name, self.time_passed), indent_level=indent_level)
self.info("[%s] completed [%.4fs]" % (progress_name, self.time_passed), indent_level=indent_level)
elif time_unit == "ms":
self.info("[%s] finished [%.4fms]" % (progress_name, self.time_passed * 1e3), indent_level=indent_level)
self.info("[%s] completed [%.4fms]" % (progress_name, self.time_passed * 1e3), indent_level=indent_level)
else:
raise NotImplementedError
# self.logger.info("|")
Expand Down
18 changes: 9 additions & 9 deletions dynamo/external/sctransform.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
# =================================================================

import os
from multiprocessing import Manager, Pool

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -128,6 +127,8 @@ def sctransform_core(
"""
A re-implementation of SCTransform from the Satija lab.
"""
import multiprocessing

main_info("sctransform adata on layer: %s" % (layer))
X = DKM.select_layer_data(adata, layer).copy()
X = sp.sparse.csr_matrix(X)
Expand All @@ -139,10 +140,8 @@ def sctransform_core(
genes_ix = genes.copy()

X = X[:, genes]
Xraw = X.copy()
gene_names = gene_names[genes]
genes = np.arange(X.shape[1])
genes_cell_count = X.sum(0).A.flatten()

genes_log_gmean = np.log10(gmean(X, axis=0, eps=gmean_eps))

Expand Down Expand Up @@ -188,7 +187,10 @@ def sctransform_core(
bin_ind = np.ceil(np.arange(1, genes_step1.size + 1) / bin_size)
max_bin = max(bin_ind)

ps = Manager().dict()
ps = multiprocessing.Manager().dict()

# create a process context of fork that copy a Python process from an existing process.
ctx = multiprocessing.get_context("fork")

for i in range(1, int(max_bin) + 1):
genes_bin_regress = genes_step1[bin_ind == i]
Expand All @@ -197,7 +199,9 @@ def sctransform_core(
mm = np.vstack((np.ones(data_step1.shape[0]), data_step1["log_umi"].values.flatten())).T

pc_chunksize = umi_bin.shape[1] // os.cpu_count() + 1
pool = Pool(os.cpu_count(), _parallel_init, [genes_bin_regress, umi_bin, gene_names, mm, ps])

pool = ctx.Pool(os.cpu_count(), _parallel_init, [genes_bin_regress, umi_bin, gene_names, mm, ps])

try:
pool.map(_parallel_wrapper, range(umi_bin.shape[1]), chunksize=pc_chunksize)
finally:
Expand Down Expand Up @@ -254,10 +258,6 @@ def sctransform_core(
full_model_pars["theta"] = theta
del full_model_pars["dispersion"]

model_pars_outliers = outliers

regressor_data = np.vstack((np.ones(cell_attrs.shape[0]), cell_attrs["log_umi"].values)).T

d = X.data
x, y = X.nonzero()
mud = np.exp(full_model_pars.values[:, 0][y] + full_model_pars.values[:, 1][y] * cell_attrs["log_umi"].values[x])
Expand Down
61 changes: 16 additions & 45 deletions dynamo/plot/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ..configuration import DynamoAdataKeyManager
from ..dynamo_logger import main_warning
from ..preprocessing import preprocess as pp
from ..preprocessing.preprocess_monocle_utils import top_table
from ..preprocessing.gene_selection import get_prediction_by_svr
from ..preprocessing.utils import detect_experiment_datatype
from ..tools.utils import get_mapper, update_dict
from .utils import save_fig
Expand Down Expand Up @@ -649,10 +649,9 @@ def feature_genes(
save_show_or_return: str = "show",
save_kwargs: dict = {},
):
"""Plot selected feature genes on top of the mean vs. dispersion scatterplot.
"""Plot selected feature genes on top of the mean vs. dispersion scatter plot.

Parameters
----------
Args:
adata: :class:`~anndata.AnnData`
AnnData object
layer: `str` (default: `X`)
Expand All @@ -664,32 +663,22 @@ def feature_genes(
save_show_or_return: {'show', 'save', 'return'} (default: `show`)
Whether to save, show or return the figure.
save_kwargs: `dict` (default: `{}`)
A dictionary that will passed to the save_fig function. By default it is an empty dictionary and the
A dictionary that will be passed to the save_fig function. By default, it is an empty dictionary and the
save_fig function will use the {"path": None, "prefix": 'feature_genes', "dpi": None, "ext": 'pdf',
"transparent": True, "close": True, "verbose": True} as its parameters. Otherwise you can provide a
"transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a
dictionary that properly modify those keys according to your needs.

Returns
-------
Returns:
Nothing but plots the selected feature genes via the mean, CV plot.
"""

import matplotlib.pyplot as plt

mode = adata.uns["feature_selection"] if mode is None else mode

layer = DynamoAdataKeyManager.get_available_layer_keys(adata, layer, include_protein=False)[0]

uns_store_key = None
if mode == "dispersion":
uns_store_key = "dispFitInfo" if layer in ["raw", "X"] else layer + "_dispFitInfo"

table = top_table(adata, layer)
x_min, x_max = (
np.nanmin(table["mean_expression"]),
np.nanmax(table["mean_expression"]),
)
elif mode == "SVR":
if "_dispersion" in mode: # "cv_dispersion", "fano_dispersion"
prefix = "" if layer == "X" else layer + "_"
uns_store_key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR"

Expand All @@ -709,11 +698,12 @@ def feature_genes(
ordering_genes = adata.var["use_for_pca"] if "use_for_pca" in adata.var.columns else None

mu_linspace = np.linspace(x_min, x_max, num=1000)
fit = (
adata.uns[uns_store_key]["disp_func"](mu_linspace)
if mode == "dispersion"
else adata.uns[uns_store_key]["SVR"](mu_linspace.reshape(-1, 1))
)
if "_dispersion" in mode:
mean = adata.uns[uns_store_key]["mean"]
cv = adata.uns[uns_store_key]["cv"]
svr_gamma = adata.uns[uns_store_key]["svr_gamma"]
fit, _ = get_prediction_by_svr(mean, cv, svr_gamma)
fit = fit(mu_linspace.reshape(-1, 1))

plt.figure(figsize=figsize)
plt.plot(mu_linspace, fit, alpha=0.4, color="r")
Expand All @@ -724,15 +714,7 @@ def feature_genes(
)

valid_disp_table = table.iloc[valid_ind, :]
if mode == "dispersion":
ax = plt.scatter(
valid_disp_table["mean_expression"],
valid_disp_table["dispersion_empirical"],
s=3,
alpha=1,
color="xkcd:red",
)
elif mode == "SVR":
if "_dispersion" in mode:
ax = plt.scatter(
valid_disp_table[prefix + "log_m"],
valid_disp_table[prefix + "log_cv"],
Expand All @@ -743,15 +725,7 @@ def feature_genes(

neg_disp_table = table.iloc[~valid_ind, :]

if mode == "dispersion":
ax = plt.scatter(
neg_disp_table["mean_expression"],
neg_disp_table["dispersion_empirical"],
s=3,
alpha=0.5,
color="xkcd:grey",
)
elif mode == "SVR":
if "_dispersion" in mode:
ax = plt.scatter(
Ukyeon marked this conversation as resolved.
Show resolved Hide resolved
neg_disp_table[prefix + "log_m"],
neg_disp_table[prefix + "log_cv"],
Expand All @@ -760,9 +734,6 @@ def feature_genes(
color="xkcd:grey",
)

# plt.xlim((0, 100))
if mode == "dispersion":
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Mean (log)")
plt.ylabel("Dispersion (log)") if mode == "dispersion" else plt.ylabel("CV (log)")
Expand Down Expand Up @@ -1080,7 +1051,7 @@ def highest_frac_genes(

else:
main_warning(
"%s not in adata.var, ignoring the gene annotation key when plotting",
"%s not in adata.var, ignoring the gene annotation key when plotting" % gene_annotation_key,
indent_level=2,
)

Expand Down
Loading