From be20230da84e381cfab0d86537fb493b226d3675 Mon Sep 17 00:00:00 2001
From: Christian Michelsen
Date: Mon, 10 Oct 2022 17:07:23 +0200
Subject: [PATCH] feat: Add significance as main fit quality

---
 src/metaDMG/fit/bayesian.py    | 121 +++++++++++++++++++++------------
 src/metaDMG/fit/fit_utils.py   |   5 +-
 src/metaDMG/fit/fits.py        |   7 +-
 src/metaDMG/fit/frequentist.py |  44 +++++++-----
 src/metaDMG/fit/results.py     |   4 +-
 5 files changed, 116 insertions(+), 65 deletions(-)

diff --git a/src/metaDMG/fit/bayesian.py b/src/metaDMG/fit/bayesian.py
index d73880f..ca21da1 100644
--- a/src/metaDMG/fit/bayesian.py
+++ b/src/metaDMG/fit/bayesian.py
@@ -9,6 +9,7 @@
 from numpyro import distributions as dist
 from numpyro.infer import MCMC, NUTS, Predictive, log_likelihood
 from scipy.special import logsumexp
+from scipy.stats import beta as sp_beta
 from scipy.stats import betabinom as sp_betabinom
 from scipy.stats import norm as sp_norm
 
@@ -110,19 +111,19 @@ def get_n_sigma_probability(n_sigma):
 CONF_1_SIGMA = get_n_sigma_probability(1)
 
 
-def compute_posterior(
-    mcmc, data, func_avg=np.mean, func_dispersion=lambda x: np.std(x, axis=0)
-):
-    """func = central tendency function, e.g. np.mean or np.median"""
-    posterior_predictive = get_posterior_predictive(mcmc, data)
-    predictions_fraction = posterior_predictive["obs"] / data["N"]
-    y_average = func_avg(predictions_fraction, axis=0)
-    # y_dispersion = numpyro.diagnostics.hpdi(predictions_fraction, prob=0.68)
-    y_dispersion = func_dispersion(predictions_fraction)
-    return y_average, y_dispersion
+# def compute_posterior(
+#     mcmc, data, func_avg=np.mean, func_dispersion=lambda x: np.std(x, axis=0)
+# ):
+#     """func = central tendency function, e.g. np.mean or np.median"""
+#     posterior_predictive = get_posterior_predictive(mcmc, data)
+#     predictions_fraction = posterior_predictive["obs"] / data["N"]
+#     y_average = func_avg(predictions_fraction, axis=0)
+#     # y_dispersion = numpyro.diagnostics.hpdi(predictions_fraction, prob=0.68)
+#     y_dispersion = func_dispersion(predictions_fraction)
+#     return y_average, y_dispersion
 
 
-def compute_D_max(mcmc, data):
+def add_D_max_information(fit_result, mcmc, data):
     # posterior = get_posterior_predictive(mcmc, data)
     # c = mcmc.get_samples()["c"]
     # f = posterior["obs"] / data["N"]
@@ -133,7 +134,7 @@
 
     # New method, more similar to frequentist and better when few reads
     A = mcmc.get_samples()["A"]
-    c = mcmc.get_samples()["c"]
+    # c = mcmc.get_samples()["c"]
     phi = mcmc.get_samples()["phi"]
     N = max(data["N"][0], 1)
     mu = np.mean(A)
@@ -144,18 +145,65 @@
     alpha = Dx * phi
     beta = (1 - Dx) * phi
 
-    # pdf = sp_betabinom(N, alpha, beta)
-    pdf = sp_betabinom(N, alpha.mean(), beta.mean())  # 1000x faster approximation
-    return {
-        "mu": mu.item(),
-        "std": std.item(),
-        "median": pdf.median().mean() / N,
-        "confidence_interval_1_sigma_low": pdf.ppf((1 - CONF_1_SIGMA) / 2.0).mean() / N,
-        "confidence_interval_1_sigma_high": pdf.ppf((1 + CONF_1_SIGMA) / 2.0).mean()
-        / N,
-        "confidence_interval_95_low": pdf.ppf((1 - 0.95) / 2.0).mean() / N,
-        "confidence_interval_95_high": pdf.ppf((1 + 0.95) / 2.0).mean() / N,
-    }
+    pdf = sp_betabinom(N, alpha, beta)
+    # 1000x faster approximation for ppf
+    pdf_approx = sp_betabinom(N, alpha.mean(), beta.mean())
+
+    pdf_beta = sp_beta(alpha, beta)
+
+    # pdf.pmf(0).mean()
+    # pdf_approx.pmf(0)  # 3 times faster for pmf
+
+    prefix = "Bayesian_"
+    fit_result[f"{prefix}D_max"] = mu.item()
+    fit_result[f"{prefix}D_max_std"] = std.item()
+
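A note on `add_D_max_information` above: each MCMC draw of (`Dx`, `phi`) defines a beta-binomial for the damaged count at the first read position via `alpha = Dx * phi` and `beta = (1 - Dx) * phi`. The exact `pdf` keeps one distribution per draw for cdf-type quantities, while `pdf_approx` collapses the draws to their mean parameters before calling the comparatively slow `ppf`. A minimal sketch of that trade-off with synthetic draws (`A_draws` and `phi_draws` are illustrative stand-ins for `mcmc.get_samples()` output, not names from this patch):

    import numpy as np
    from scipy.stats import betabinom as sp_betabinom

    rng = np.random.default_rng(0)
    N = 100                                        # reads covering position 1
    A_draws = rng.beta(20, 80, size=1_000)         # stand-in posterior draws of the damage fraction
    phi_draws = rng.gamma(100.0, 1.0, size=1_000)  # stand-in posterior draws of the concentration

    alpha = A_draws * phi_draws
    beta = (1 - A_draws) * phi_draws

    # Exact: one beta-binomial per posterior draw, then average the per-draw quantiles.
    exact = sp_betabinom(N, alpha, beta).ppf(0.975).mean() / N

    # Approximation: a single beta-binomial at the posterior-mean parameters.
    approx = sp_betabinom(N, alpha.mean(), beta.mean()).ppf(0.975) / N

    print(f"{exact:.4f} vs {approx:.4f}")

The two quantiles typically agree closely, while the approximation needs a single `ppf` evaluation instead of one per draw.
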
fit_result[f"{prefix}prob_lt_5p_damage"] = pdf_beta.cdf(0.05).mean() + # fit_result[f"{prefix}prob_lt_5p_damage_betabinom"] = pdf.cdf(0.05 * N).mean() + fit_result[f"{prefix}prob_lt_2p_damage"] = pdf_beta.cdf(0.02).mean() + # fit_result[f"{prefix}prob_lt_2p_damage_betabinom"] = pdf.cdf(0.02 * N).mean() + fit_result[f"{prefix}prob_lt_1p_damage"] = pdf_beta.cdf(0.01).mean() + # fit_result[f"{prefix}prob_lt_1p_damage_betabinom"] = pdf.cdf(0.01 * N).mean() + fit_result[f"{prefix}prob_lt_0.1p_damage"] = pdf_beta.cdf(0.001).mean() + # fit_result[f"{prefix}prob_lt_0.1p_damage_betabinom"] = pdf.cdf(0.001 * N).mean() + + fit_result[f"{prefix}prob_zero_damage"] = pdf.cdf(0).mean() + + fit_result[f"{prefix}D_max_median"] = pdf.median().mean() / N + + for n_sigma in [1, 2, 3]: + conf = get_n_sigma_probability(n_sigma) + fit_result[f"{prefix}D_max_confidence_interval_{n_sigma}_sigma_low"] = ( + pdf_approx.ppf((1 - conf) / 2.0).mean() / N + ) + fit_result[f"{prefix}D_max_confidence_interval_{n_sigma}_sigma_high"] = ( + pdf_approx.ppf((1 + conf) / 2.0).mean() / N + ) + + fit_result[f"{prefix}D_max_confidence_interval_95_low"] = ( + pdf_approx.ppf((1 - 0.95) / 2.0).mean() / N + ) + fit_result[f"{prefix}D_max_confidence_interval_95_high"] = ( + pdf_approx.ppf((1 + 0.95) / 2.0).mean() / N + ) + + # fit_result["Bayesian_D_max"] = D_max["mu"] + # fit_result["Bayesian_D_max_std"] = D_max["std"] + # fit_result["Bayesian_D_max_median"] = D_max["median"] + # fit_result["Bayesian_D_max_confidence_interval_1_sigma_low"] = D_max[ + # "confidence_interval_1_sigma_low" + # ] + # fit_result["Bayesian_D_max_confidence_interval_1_sigma_high"] = D_max[ + # "confidence_interval_1_sigma_high" + # ] + # fit_result["Bayesian_D_max_confidence_interval_95_low"] = D_max[ + # "confidence_interval_95_low" + # ] + # fit_result["Bayesian_D_max_confidence_interval_95_high"] = D_max[ + # "confidence_interval_95_high" + # ] + + return fit_result #%% @@ -347,25 +395,7 @@ def add_Bayesian_fit_result( d_results_PMD = get_lppd_and_waic(mcmc_PMD, data) d_results_null = get_lppd_and_waic(mcmc_null, data) - z = compute_z(d_results_PMD, d_results_null) - fit_result["Bayesian_z"] = z - - D_max = compute_D_max(mcmc_PMD, data) - fit_result["Bayesian_D_max"] = D_max["mu"] - fit_result["Bayesian_D_max_std"] = D_max["std"] - fit_result["Bayesian_D_max_median"] = D_max["median"] - fit_result["Bayesian_D_max_confidence_interval_1_sigma_low"] = D_max[ - "confidence_interval_1_sigma_low" - ] - fit_result["Bayesian_D_max_confidence_interval_1_sigma_high"] = D_max[ - "confidence_interval_1_sigma_high" - ] - fit_result["Bayesian_D_max_confidence_interval_95_low"] = D_max[ - "confidence_interval_95_low" - ] - fit_result["Bayesian_D_max_confidence_interval_95_high"] = D_max[ - "confidence_interval_95_high" - ] + add_D_max_information(fit_result, mcmc_PMD, data) add_summary_of_variable(fit_result, mcmc_PMD, "A") add_summary_of_variable(fit_result, mcmc_PMD, "q") @@ -374,10 +404,13 @@ def add_Bayesian_fit_result( fit_result["Bayesian_rho_Ac"] = compute_rho_Ac(mcmc_PMD) + z = compute_z(d_results_PMD, d_results_null) + fit_result["Bayesian_z"] = z + def make_fits(fit_result, data, mcmc_PMD, mcmc_null): fit_mcmc(mcmc_PMD, data) - fit_mcmc(mcmc_null, data) + fit_mcmc(mcmc_null, data) # do not use non-PMD model later on add_Bayesian_fit_result(fit_result, data, mcmc_PMD, mcmc_null) # mcmc_PMD.print_summary(prob=0.68) # mcmc_null.print_summary(prob=0.68) diff --git a/src/metaDMG/fit/fit_utils.py b/src/metaDMG/fit/fit_utils.py index 582d494..b33f237 100644 --- 
+++ b/src/metaDMG/fit/fit_utils.py
@@ -131,9 +131,12 @@ def z_to_prob(z):
     return erf(z / np.sqrt(2))
 
 
-def compute_likelihood_ratio(frequentist_PMD, frequentist_null):
+def compute_likelihood_ratio(frequentist_PMD, frequentist_null, only_LR=False):
     LR = -2 * (frequentist_PMD.log_likelihood - frequentist_null.log_likelihood)
 
+    if only_LR:
+        return LR
+
     df = len(describe(frequentist_PMD)) - len(describe(frequentist_null))
     LR_P = sp_chi2.sf(x=LR, df=df)
     LR_z = prob_to_z(1 - LR_P)

diff --git a/src/metaDMG/fit/fits.py b/src/metaDMG/fit/fits.py
index 161d29e..c2087b2 100644
--- a/src/metaDMG/fit/fits.py
+++ b/src/metaDMG/fit/fits.py
@@ -627,19 +627,22 @@ def compute(config, df_mismatches):
     df_fit_results = pd.merge(df_fit_results, df_stat_cut, on="tax_id")
 
     cols_ordered = [
+        "sample",
         "tax_id",
         "tax_name",
         "tax_rank",
         "N_reads",
         "N_alignments",
-        "lambda_LR",
         "D_max",
+        "significance",
         "mean_L",
         "std_L",
         "mean_GC",
         "std_GC",
         "tax_path",
-        *df_fit_results.loc[:, "D_max_std":"sample"].columns.drop("tax_id"),
+    ]
+    cols_ordered = cols_ordered + [
+        c for c in df_fit_results.columns if c not in cols_ordered
     ]
 
     # if local or global damage

diff --git a/src/metaDMG/fit/frequentist.py b/src/metaDMG/fit/frequentist.py
index 2c9c193..fd35964 100644
--- a/src/metaDMG/fit/frequentist.py
+++ b/src/metaDMG/fit/frequentist.py
@@ -66,7 +66,7 @@ def __init__(
         self.is_fitted = False
 
     def __repr__(self):
-        s = f"FrequentistPMD(data, method={self.method}). \n\n"
+        s = f"FrequentistPMD(data, method={self.method}). \n"
         if self.is_fitted:
             s += self.__str__()
         return s
@@ -77,6 +77,7 @@ def __str__(self):
         s += f"A = {self.A:.3f}, q = {self.q:.3f},"
         s += f"c = {self.c:.5f}, phi = {self.phi:.1f} \n"
         s += f"D_max = {self.D_max:.3f} +/- {self.D_max_std:.3f} \n"
+        s += f"significance = {self.significance:.3f} \n"
         s += f"rho_Ac = {self.rho_Ac:.3f} \n"
         s += f"log_likelihood = {self.log_likelihood:.3f} \n"
         s += f"valid = {self.valid}"
@@ -230,9 +231,9 @@ def fit(self):
         self.valid = self.m.valid
 
         if self.valid:
-            self.D_max, self.D_max_std = self._get_D_max()
+            self.D_max, self.D_max_std, self.significance = self._get_D_max()
         else:
-            self.D_max, self.D_max_std = np.nan, np.nan
+            self.D_max, self.D_max_std, self.significance = np.nan, np.nan, np.nan
 
         return self
 
@@ -314,7 +315,9 @@ def _get_D_max(self):
 
         std = np.sqrt(A * (1 - A) * (phi + N) / ((phi + 1) * N))
 
-        return mu, std
+        significance = mu / std
+
+        return mu, std, significance
 
         # Dx_x1 = A
         # alpha = Dx_x1 * phi
@@ -508,8 +511,11 @@ class Frequentist:
     def __init__(self, data, sample, tax_id, method="posterior", p0=None):
         self.PMD = FrequentistPMD(data, sample, tax_id, method=method, p0=p0).fit()
         self.null = FrequentistNull(data, sample, tax_id, method=method).fit()
-        p = fit_utils.compute_likelihood_ratio(self.PMD, self.null)
-        self.lambda_LR, self.lambda_LR_P, self.lambda_LR_z = p
+        self.lambda_LR = fit_utils.compute_likelihood_ratio(
+            self.PMD,
+            self.null,
+            only_LR=True,
+        )
 
         self.valid = self.PMD.valid
 
@@ -530,14 +536,14 @@ def __str__(self):
         s = f"sample = {self.sample}, tax_id = {self.tax_id} \n"
         s += f"A = {self.A:.3f}, q = {self.q:.3f}, "
         s += f"c = {self.c:.5f}, phi = {self.phi:.1f} \n"
-        s += f"D_max = {self.D_max:.3f} +/- {self.D_max_std:.3f}, "
+        s += f"D_max = {self.D_max:.3f} +/- {self.D_max_std:.3f}, significance = {self.significance:.3f} \n"
         s += f"rho_Ac = {self.rho_Ac:.3f} \n"
         s += f"log_likelihood_PMD = {self.PMD.log_likelihood:.3f} \n"
         s += f"log_likelihood_null = {self.null.log_likelihood:.3f} \n"
         s += (
-            f"lambda_LR = {self.lambda_LR:.3f}, "
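The `significance` introduced here is simply the `D_max` point estimate divided by its standard deviation, with the latter taken from the beta-binomial spread at the first read position (the `std` expression in `_get_D_max` above). A worked sketch with made-up fit values, not actual fit output:

    import numpy as np

    A, phi, N = 0.25, 100.0, 50  # hypothetical fit: damage amplitude, concentration, reads

    mu = A  # D_max point estimate
    std = np.sqrt(A * (1 - A) * (phi + N) / ((phi + 1) * N))
    significance = mu / std  # the new fit-quality measure

    print(f"D_max = {mu:.3f} +/- {std:.3f}, significance = {significance:.2f}")
    # D_max = 0.250 +/- 0.075, significance = 3.35
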
-            f"lambda_LR as prob = {self.lambda_LR_P:.4%}, "
-            f"lambda_LR as z = {self.lambda_LR_z:.3f} \n"
+            f"lambda_LR = {self.lambda_LR:.3f} \n"
+            # f"lambda_LR as prob = {self.lambda_LR_P:.4%}, "
+            # f"lambda_LR as z = {self.lambda_LR_z:.3f} \n"
         )
         s += f"valid = {self.valid}"
         return s
@@ -594,6 +600,10 @@ def chi2(self):
     def PMD_values(self):
         return self.PMD.values
 
+    @property
+    def significance(self):
+        return self.PMD.significance
+
 
 #%%
@@ -644,9 +654,10 @@ def make_forward_reverse_fits(fit_result, data, sample, tax_id):
     )
 
     vars_to_keep = [
-        "lambda_LR",
         "D_max",
         "D_max_std",
+        "significance",
+        "lambda_LR",
         "q",
         "q_std",
         "phi",
@@ -656,8 +667,8 @@
         "c",
         "c_std",
         "rho_Ac",
-        "lambda_LR_P",
-        "lambda_LR_z",
+        # "lambda_LR_P",
+        # "lambda_LR_z",
         "valid",
     ]
@@ -694,9 +705,10 @@ def make_forward_fits(fit_result, data, sample, tax_id):
     fit_all = Frequentist(data, sample, tax_id, method="posterior")
 
     vars_to_keep = [
-        "lambda_LR",
         "D_max",
         "D_max_std",
+        "significance",
+        "lambda_LR",
         "q",
         "q_std",
         "phi",
@@ -706,8 +718,8 @@
         "c",
         "c_std",
         "rho_Ac",
-        "lambda_LR_P",
-        "lambda_LR_z",
+        # "lambda_LR_P",
+        # "lambda_LR_z",
         "valid",
     ]

diff --git a/src/metaDMG/fit/results.py b/src/metaDMG/fit/results.py
index cbf6b31..b9df1e0 100644
--- a/src/metaDMG/fit/results.py
+++ b/src/metaDMG/fit/results.py
@@ -108,15 +108,15 @@ def merge(
     df_results = pd.merge(df_fit_results, df_mismatches_wide, on=["tax_id"])
 
     columns_order = [
+        "sample",
         "tax_id",
         "tax_name",
         "tax_rank",
-        "sample",
         "N_reads",
         "N_alignments",
         #
-        "lambda_LR",
         "D_max",
+        "significance",
         "mean_L",
         "mean_GC",
         "q",
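Both `fits.py` and `results.py` in this patch now build their column order the same way: a fixed prefix with `sample` first and `significance` next to `D_max` (replacing `lambda_LR` near the front), followed by every remaining column in its existing order. A toy sketch of the pattern (abbreviated, hypothetical column set):

    import pandas as pd

    df = pd.DataFrame(columns=["tax_id", "D_max", "significance", "sample", "q", "q_std"])

    cols_ordered = ["sample", "tax_id", "D_max", "significance"]
    # Append whatever is left, preserving the frame's existing order.
    cols_ordered = cols_ordered + [c for c in df.columns if c not in cols_ordered]

    df = df[cols_ordered]
    print(list(df.columns))
    # ['sample', 'tax_id', 'D_max', 'significance', 'q', 'q_std']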