feat: Add significance as main fit quality
ChristianMichelsen committed Oct 10, 2022
1 parent 5407704 commit be20230
Showing 5 changed files with 116 additions and 65 deletions.
src/metaDMG/fit/bayesian.py: 121 changes (77 additions, 44 deletions)
@@ -9,6 +9,7 @@
from numpyro import distributions as dist
from numpyro.infer import MCMC, NUTS, Predictive, log_likelihood
from scipy.special import logsumexp
from scipy.stats import beta as sp_beta
from scipy.stats import betabinom as sp_betabinom
from scipy.stats import norm as sp_norm

@@ -110,19 +111,19 @@ def get_n_sigma_probability(n_sigma):
CONF_1_SIGMA = get_n_sigma_probability(1)

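# Assumed definition (sketch, not part of the diff): judging from z_to_prob in
# fit_utils.py below, get_n_sigma_probability(n) is the two-sided normal
# coverage erf(n / sqrt(2)), so CONF_1_SIGMA ≈ 0.6827:
#
#     from scipy.special import erf
#     import numpy as np
#
#     def get_n_sigma_probability(n_sigma):
#         return erf(n_sigma / np.sqrt(2))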

def compute_posterior(
mcmc, data, func_avg=np.mean, func_dispersion=lambda x: np.std(x, axis=0)
):
"""func = central tendency function, e.g. np.mean or np.median"""
posterior_predictive = get_posterior_predictive(mcmc, data)
predictions_fraction = posterior_predictive["obs"] / data["N"]
y_average = func_avg(predictions_fraction, axis=0)
# y_dispersion = numpyro.diagnostics.hpdi(predictions_fraction, prob=0.68)
y_dispersion = func_dispersion(predictions_fraction)
return y_average, y_dispersion
# def compute_posterior(
# mcmc, data, func_avg=np.mean, func_dispersion=lambda x: np.std(x, axis=0)
# ):
# """func = central tendency function, e.g. np.mean or np.median"""
# posterior_predictive = get_posterior_predictive(mcmc, data)
# predictions_fraction = posterior_predictive["obs"] / data["N"]
# y_average = func_avg(predictions_fraction, axis=0)
# # y_dispersion = numpyro.diagnostics.hpdi(predictions_fraction, prob=0.68)
# y_dispersion = func_dispersion(predictions_fraction)
# return y_average, y_dispersion


def compute_D_max(mcmc, data):
def add_D_max_information(fit_result, mcmc, data):
# posterior = get_posterior_predictive(mcmc, data)
# c = mcmc.get_samples()["c"]
# f = posterior["obs"] / data["N"]
@@ -133,7 +134,7 @@ def compute_D_max(mcmc, data):

# New method, more similar to frequentist and better when few reads
A = mcmc.get_samples()["A"]
c = mcmc.get_samples()["c"]
# c = mcmc.get_samples()["c"]
phi = mcmc.get_samples()["phi"]
N = max(data["N"][0], 1)
mu = np.mean(A)
@@ -144,18 +145,65 @@ def compute_D_max(mcmc, data):
alpha = Dx * phi
beta = (1 - Dx) * phi

# pdf = sp_betabinom(N, alpha, beta)
pdf = sp_betabinom(N, alpha.mean(), beta.mean()) # 1000x faster approximation
return {
"mu": mu.item(),
"std": std.item(),
"median": pdf.median().mean() / N,
"confidence_interval_1_sigma_low": pdf.ppf((1 - CONF_1_SIGMA) / 2.0).mean() / N,
"confidence_interval_1_sigma_high": pdf.ppf((1 + CONF_1_SIGMA) / 2.0).mean()
/ N,
"confidence_interval_95_low": pdf.ppf((1 - 0.95) / 2.0).mean() / N,
"confidence_interval_95_high": pdf.ppf((1 + 0.95) / 2.0).mean() / N,
}
pdf = sp_betabinom(N, alpha, beta)
# 1000x faster approximation for ppf
pdf_approx = sp_betabinom(N, alpha.mean(), beta.mean())

pdf_beta = sp_beta(alpha, beta)

# pdf.pmf(0).mean()
# pdf_approx.pmf(0) # 3 times faster for pmf

prefix = "Bayesian_"
fit_result[f"{prefix}D_max"] = mu.item()
fit_result[f"{prefix}D_max_std"] = std.item()

fit_result[f"{prefix}prob_lt_5p_damage"] = pdf_beta.cdf(0.05).mean()
# fit_result[f"{prefix}prob_lt_5p_damage_betabinom"] = pdf.cdf(0.05 * N).mean()
fit_result[f"{prefix}prob_lt_2p_damage"] = pdf_beta.cdf(0.02).mean()
# fit_result[f"{prefix}prob_lt_2p_damage_betabinom"] = pdf.cdf(0.02 * N).mean()
fit_result[f"{prefix}prob_lt_1p_damage"] = pdf_beta.cdf(0.01).mean()
# fit_result[f"{prefix}prob_lt_1p_damage_betabinom"] = pdf.cdf(0.01 * N).mean()
fit_result[f"{prefix}prob_lt_0.1p_damage"] = pdf_beta.cdf(0.001).mean()
# fit_result[f"{prefix}prob_lt_0.1p_damage_betabinom"] = pdf.cdf(0.001 * N).mean()

fit_result[f"{prefix}prob_zero_damage"] = pdf.cdf(0).mean()

fit_result[f"{prefix}D_max_median"] = pdf.median().mean() / N

for n_sigma in [1, 2, 3]:
conf = get_n_sigma_probability(n_sigma)
fit_result[f"{prefix}D_max_confidence_interval_{n_sigma}_sigma_low"] = (
pdf_approx.ppf((1 - conf) / 2.0).mean() / N
)
fit_result[f"{prefix}D_max_confidence_interval_{n_sigma}_sigma_high"] = (
pdf_approx.ppf((1 + conf) / 2.0).mean() / N
)

fit_result[f"{prefix}D_max_confidence_interval_95_low"] = (
pdf_approx.ppf((1 - 0.95) / 2.0).mean() / N
)
fit_result[f"{prefix}D_max_confidence_interval_95_high"] = (
pdf_approx.ppf((1 + 0.95) / 2.0).mean() / N
)

# fit_result["Bayesian_D_max"] = D_max["mu"]
# fit_result["Bayesian_D_max_std"] = D_max["std"]
# fit_result["Bayesian_D_max_median"] = D_max["median"]
# fit_result["Bayesian_D_max_confidence_interval_1_sigma_low"] = D_max[
# "confidence_interval_1_sigma_low"
# ]
# fit_result["Bayesian_D_max_confidence_interval_1_sigma_high"] = D_max[
# "confidence_interval_1_sigma_high"
# ]
# fit_result["Bayesian_D_max_confidence_interval_95_low"] = D_max[
# "confidence_interval_95_low"
# ]
# fit_result["Bayesian_D_max_confidence_interval_95_high"] = D_max[
# "confidence_interval_95_high"
# ]

return fit_result


#%%
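The new add_D_max_information above replaces a single point estimate with a posterior summary of D_max. A minimal standalone sketch of the two shortcuts it relies on, using made-up posterior draws for A and phi and a hypothetical N; note the commit builds alpha and beta from a clipped Dx rather than from A directly:

    import numpy as np
    from scipy.stats import beta as sp_beta
    from scipy.stats import betabinom as sp_betabinom

    rng = np.random.default_rng(0)
    A = rng.beta(2, 18, size=1000)      # stand-in posterior draws of the damage amplitude
    phi = rng.gamma(10, 10, size=1000)  # stand-in posterior draws of the concentration
    N = 100                             # stand-in number of reads

    alpha, beta = A * phi, (1 - A) * phi

    # Exact: one beta-binomial per posterior draw (slow for median/ppf).
    pdf = sp_betabinom(N, alpha, beta)

    # The commit's ~1000x faster approximation: a single beta-binomial with
    # posterior-mean parameters, used for the median and confidence intervals.
    pdf_approx = sp_betabinom(N, alpha.mean(), beta.mean())
    D_max_median = pdf_approx.median() / N
    ci_low, ci_high = pdf_approx.ppf([(1 - 0.95) / 2, (1 + 0.95) / 2]) / N

    # P(damage < threshold): beta CDF per draw, averaged over the posterior.
    prob_lt_1p_damage = sp_beta(alpha, beta).cdf(0.01).mean()
    # P(zero damaged reads): beta-binomial CDF at 0, averaged over the posterior.
    prob_zero_damage = pdf.cdf(0).mean()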
@@ -347,25 +395,7 @@ def add_Bayesian_fit_result(
d_results_PMD = get_lppd_and_waic(mcmc_PMD, data)
d_results_null = get_lppd_and_waic(mcmc_null, data)

z = compute_z(d_results_PMD, d_results_null)
fit_result["Bayesian_z"] = z

D_max = compute_D_max(mcmc_PMD, data)
fit_result["Bayesian_D_max"] = D_max["mu"]
fit_result["Bayesian_D_max_std"] = D_max["std"]
fit_result["Bayesian_D_max_median"] = D_max["median"]
fit_result["Bayesian_D_max_confidence_interval_1_sigma_low"] = D_max[
"confidence_interval_1_sigma_low"
]
fit_result["Bayesian_D_max_confidence_interval_1_sigma_high"] = D_max[
"confidence_interval_1_sigma_high"
]
fit_result["Bayesian_D_max_confidence_interval_95_low"] = D_max[
"confidence_interval_95_low"
]
fit_result["Bayesian_D_max_confidence_interval_95_high"] = D_max[
"confidence_interval_95_high"
]
add_D_max_information(fit_result, mcmc_PMD, data)

add_summary_of_variable(fit_result, mcmc_PMD, "A")
add_summary_of_variable(fit_result, mcmc_PMD, "q")
Expand All @@ -374,10 +404,13 @@ def add_Bayesian_fit_result(

fit_result["Bayesian_rho_Ac"] = compute_rho_Ac(mcmc_PMD)

z = compute_z(d_results_PMD, d_results_null)
fit_result["Bayesian_z"] = z


def make_fits(fit_result, data, mcmc_PMD, mcmc_null):
fit_mcmc(mcmc_PMD, data)
fit_mcmc(mcmc_null, data)
fit_mcmc(mcmc_null, data) # do not use non-PMD model later on
add_Bayesian_fit_result(fit_result, data, mcmc_PMD, mcmc_null)
# mcmc_PMD.print_summary(prob=0.68)
# mcmc_null.print_summary(prob=0.68)
src/metaDMG/fit/fit_utils.py: 5 changes (4 additions, 1 deletion)
@@ -131,9 +131,12 @@ def z_to_prob(z):
return erf(z / np.sqrt(2))


def compute_likelihood_ratio(frequentist_PMD, frequentist_null):
def compute_likelihood_ratio(frequentist_PMD, frequentist_null, only_LR=False):
LR = -2 * (frequentist_PMD.log_likelihood - frequentist_null.log_likelihood)

if only_LR:
return LR

df = len(describe(frequentist_PMD)) - len(describe(frequentist_null))
LR_P = sp_chi2.sf(x=LR, df=df)
LR_z = prob_to_z(1 - LR_P)
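When only_LR is False, the function goes on to convert the likelihood ratio into a p-value and a z-score via Wilks' theorem. A sketch of that conversion with hypothetical numbers, assuming prob_to_z is the inverse of the z_to_prob shown above:

    import numpy as np
    from scipy.special import erfinv
    from scipy.stats import chi2 as sp_chi2

    def prob_to_z(p):
        # assumed inverse of z_to_prob(z) = erf(z / sqrt(2))
        return np.sqrt(2) * erfinv(p)

    LR, df = 12.3, 2                # hypothetical likelihood ratio and dof difference
    LR_P = sp_chi2.sf(x=LR, df=df)  # p-value under the chi-squared null
    LR_z = prob_to_z(1 - LR_P)      # the same evidence expressed as a z-score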
src/metaDMG/fit/fits.py: 7 changes (5 additions, 2 deletions)
@@ -627,19 +627,22 @@ def compute(config, df_mismatches):
df_fit_results = pd.merge(df_fit_results, df_stat_cut, on="tax_id")

cols_ordered = [
"sample",
"tax_id",
"tax_name",
"tax_rank",
"N_reads",
"N_alignments",
"lambda_LR",
"D_max",
"significance",
"mean_L",
"std_L",
"mean_GC",
"std_GC",
"tax_path",
*df_fit_results.loc[:, "D_max_std":"sample"].columns.drop("tax_id"),
]
cols_ordered = cols_ordered + [
c for c in df_fit_results.columns if c not in cols_ordered
]

# if local or global damage
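The cols_ordered block above pins the key columns first (significance now sits directly after D_max) and appends whatever else the frame carries. A minimal illustration of the idiom with a toy frame:

    import pandas as pd

    df = pd.DataFrame(columns=["q", "sample", "tax_id", "D_max", "significance", "phi"])
    cols_ordered = ["sample", "tax_id", "D_max", "significance"]
    cols_ordered = cols_ordered + [c for c in df.columns if c not in cols_ordered]
    df = df[cols_ordered]  # -> sample, tax_id, D_max, significance, q, phi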
src/metaDMG/fit/frequentist.py: 44 changes (28 additions, 16 deletions)
@@ -66,7 +66,7 @@ def __init__(
self.is_fitted = False

def __repr__(self):
s = f"FrequentistPMD(data, method={self.method}). \n\n"
s = f"FrequentistPMD(data, method={self.method}). \n"
if self.is_fitted:
s += self.__str__()
return s
@@ -77,6 +77,7 @@ def __str__(self):
s += f"A = {self.A:.3f}, q = {self.q:.3f},"
s += f"c = {self.c:.5f}, phi = {self.phi:.1f} \n"
s += f"D_max = {self.D_max:.3f} +/- {self.D_max_std:.3f} \n"
s += f"significance = {self.significance:.3f} \n"
s += f"rho_Ac = {self.rho_Ac:.3f} \n"
s += f"log_likelihood = {self.log_likelihood:.3f} \n"
s += f"valid = {self.valid}"
@@ -230,9 +231,9 @@ def fit(self):
self.valid = self.m.valid

if self.valid:
self.D_max, self.D_max_std = self._get_D_max()
self.D_max, self.D_max_std, self.significance = self._get_D_max()
else:
self.D_max, self.D_max_std = np.nan, np.nan
self.D_max, self.D_max_std, self.significance = np.nan, np.nan, np.nan

return self

@@ -314,7 +315,9 @@ def _get_D_max(self):

std = np.sqrt(A * (1 - A) * (phi + N) / ((phi + 1) * N))

return mu, std
significance = mu / std

return mu, std, significance

# Dx_x1 = A
# alpha = Dx_x1 * phi
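significance, the commit's new headline fit-quality metric, is the damage estimate divided by its uncertainty, i.e. a signal-to-noise ratio. A self-contained sketch using the beta-binomial moments shown above; taking mu equal to the fitted amplitude A is an assumption based on the surrounding code:

    import numpy as np

    def damage_significance(A, phi, N):
        mu = A  # damage fraction at position 1
        # std of the damage fraction under a beta-binomial with mean A,
        # concentration phi, and N observed reads (formula as in _get_D_max)
        std = np.sqrt(A * (1 - A) * (phi + N) / ((phi + 1) * N))
        return mu, std, mu / std

    mu, std, significance = damage_significance(A=0.15, phi=100.0, N=50)
    print(f"D_max = {mu:.3f} +/- {std:.3f}, significance = {significance:.2f}")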
@@ -508,8 +511,11 @@ class Frequentist:
def __init__(self, data, sample, tax_id, method="posterior", p0=None):
self.PMD = FrequentistPMD(data, sample, tax_id, method=method, p0=p0).fit()
self.null = FrequentistNull(data, sample, tax_id, method=method).fit()
p = fit_utils.compute_likelihood_ratio(self.PMD, self.null)
self.lambda_LR, self.lambda_LR_P, self.lambda_LR_z = p
self.lambda_LR = fit_utils.compute_likelihood_ratio(
self.PMD,
self.null,
only_LR=True,
)

self.valid = self.PMD.valid

@@ -530,14 +536,14 @@ def __str__(self):
s = f"sample = {self.sample}, tax_id = {self.tax_id} \n"
s += f"A = {self.A:.3f}, q = {self.q:.3f}, "
s += f"c = {self.c:.5f}, phi = {self.phi:.1f} \n"
s += f"D_max = {self.D_max:.3f} +/- {self.D_max_std:.3f}, "
s += f"D_max = {self.D_max:.3f} +/- {self.D_max_std:.3f}, significance = {self.significance:.3f} \n"
s += f"rho_Ac = {self.rho_Ac:.3f} \n"
s += f"log_likelihood_PMD = {self.PMD.log_likelihood:.3f} \n"
s += f"log_likelihood_null = {self.null.log_likelihood:.3f} \n"
s += (
f"lambda_LR = {self.lambda_LR:.3f}, "
f"lambda_LR as prob = {self.lambda_LR_P:.4%}, "
f"lambda_LR as z = {self.lambda_LR_z:.3f} \n"
f"lambda_LR = {self.lambda_LR:.3f} \n"
# f"lambda_LR as prob = {self.lambda_LR_P:.4%}, "
# f"lambda_LR as z = {self.lambda_LR_z:.3f} \n"
)
s += f"valid = {self.valid}"
return s
@@ -594,6 +600,10 @@ def chi2(self):
def PMD_values(self):
return self.PMD.values

@property
def significance(self):
return self.PMD.significance


#%%
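Intended downstream usage of the new property (hypothetical; data is the per-taxon mismatch structure metaDMG builds elsewhere, not shown in this diff):

    fit = Frequentist(data, sample="sample1", tax_id=42, method="posterior")
    if fit.valid:
        print(fit.D_max, fit.D_max_std, fit.significance, fit.lambda_LR)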

@@ -644,9 +654,10 @@ def make_forward_reverse_fits(fit_result, data, sample, tax_id):
)

vars_to_keep = [
"lambda_LR",
"D_max",
"D_max_std",
"significance",
"lambda_LR",
"q",
"q_std",
"phi",
Expand All @@ -656,8 +667,8 @@ def make_forward_reverse_fits(fit_result, data, sample, tax_id):
"c",
"c_std",
"rho_Ac",
"lambda_LR_P",
"lambda_LR_z",
# "lambda_LR_P",
# "lambda_LR_z",
"valid",
]

@@ -694,9 +705,10 @@ def make_forward_fits(fit_result, data, sample, tax_id):
fit_all = Frequentist(data, sample, tax_id, method="posterior")

vars_to_keep = [
"lambda_LR",
"D_max",
"D_max_std",
"significance",
"lambda_LR",
"q",
"q_std",
"phi",
Expand All @@ -706,8 +718,8 @@ def make_forward_fits(fit_result, data, sample, tax_id):
"c",
"c_std",
"rho_Ac",
"lambda_LR_P",
"lambda_LR_z",
# "lambda_LR_P",
# "lambda_LR_z",
"valid",
]

src/metaDMG/fit/results.py: 4 changes (2 additions, 2 deletions)
@@ -108,15 +108,15 @@ def merge(
df_results = pd.merge(df_fit_results, df_mismatches_wide, on=["tax_id"])

columns_order = [
"sample",
"tax_id",
"tax_name",
"tax_rank",
"sample",
"N_reads",
"N_alignments",
#
"lambda_LR",
"D_max",
"significance",
"mean_L",
"mean_GC",
"q",
