diff --git a/src/metaDMG/fit/bayesian.py b/src/metaDMG/fit/bayesian.py
index 41a80b6..d446800 100644
--- a/src/metaDMG/fit/bayesian.py
+++ b/src/metaDMG/fit/bayesian.py
@@ -29,7 +29,7 @@
 #%%
 
 
-def model_PMD(x, N, k=None):
+def numpyro_model(x, N, k=None):
     x_abs = jnp.abs(x)
 
     A = numpyro.sample("A", dist.Beta(A_prior[0], A_prior[1]))
@@ -46,18 +46,6 @@ def model_PMD(x, N, k=None):
     numpyro.sample("obs", dist.BetaBinomial(alpha, beta, N), obs=k)
 
 
-def model_null(x, N, k=None):
-    c = numpyro.sample("c", dist.Beta(c_prior[0], c_prior[1]))
-    Dx = numpyro.deterministic("Dx", c)
-    delta = numpyro.sample("delta", dist.Exponential(1 / phi_prior[1]))
-    phi = numpyro.deterministic("phi", delta + phi_prior[0])
-
-    alpha = numpyro.deterministic("alpha", Dx * phi)
-    beta = numpyro.deterministic("beta", (1 - Dx) * phi)
-
-    numpyro.sample("obs", dist.BetaBinomial(alpha, beta, N), obs=k)
-
-
 #%%
 
 
@@ -65,36 +53,16 @@ def filter_out_k(data):
     return {key: value for key, value in data.items() if key != "k"}
 
 
-def is_model_PMD(model):
-    name = model.__name__.lower()
-    if "pmd" in name:
-        return True
-    elif "null" in name:
-        return False
-    raise AssertionError(f"Model should be PMD or null, got {model}")
-
-
-#%%
-
-
-@jit
-def _get_posterior_PMD(rng_key, samples, *args, **kwargs):
-    return Predictive(model_PMD, samples)(rng_key, *args, **kwargs)
-
-
 @jit
-def _get_posterior_null(rng_key, samples, *args, **kwargs):
-    return Predictive(model_null, samples)(rng_key, *args, **kwargs)
+def _get_posterior(rng_key, samples, *args, **kwargs):
+    return Predictive(numpyro_model, samples)(rng_key, *args, **kwargs)
 
 
 def get_posterior_predictive(mcmc, data):
     posterior_samples = mcmc.get_samples()
     rng_key = Key(0)
     data_no_k = filter_out_k(data)
-    if is_model_PMD(mcmc.sampler.model):
-        return _get_posterior_PMD(rng_key, posterior_samples, **data_no_k)
-    else:
-        return _get_posterior_null(rng_key, posterior_samples, **data_no_k)
+    return _get_posterior(rng_key, posterior_samples, **data_no_k)
 
 
 def get_posterior_predictive_obs(mcmc, data):
@@ -111,98 +79,46 @@ def get_n_sigma_probability(n_sigma):
 CONF_1_SIGMA = get_n_sigma_probability(1)
 
 
-# def compute_posterior(
-#     mcmc, data, func_avg=np.mean, func_dispersion=lambda x: np.std(x, axis=0)
-# ):
-#     """func = central tendency function, e.g. np.mean or np.median"""
-#     posterior_predictive = get_posterior_predictive(mcmc, data)
-#     predictions_fraction = posterior_predictive["obs"] / data["N"]
-#     y_average = func_avg(predictions_fraction, axis=0)
-#     # y_dispersion = numpyro.diagnostics.hpdi(predictions_fraction, prob=0.68)
-#     y_dispersion = func_dispersion(predictions_fraction)
-#     return y_average, y_dispersion
-
-
-def add_D_max_information(fit_result, mcmc, data):
-    # posterior = get_posterior_predictive(mcmc, data)
-    # c = mcmc.get_samples()["c"]
-    # f = posterior["obs"] / data["N"]
-    # f = f[:, 0]
-    # D_max_samples = f - c
-    # D_max_mu = np.mean(D_max_samples).item()
-    # D_max_std = np.std(D_max_samples).item()
-
-    # New method, more similar to frequentist and better when few reads
+def add_D_information(
+    fit_result,
+    mcmc,
+    data,
+    prefix="",
+):
 
     A = mcmc.get_samples()["A"]
-    # c = mcmc.get_samples()["c"]
     phi = mcmc.get_samples()["phi"]
     N = max(data["N"][0], 1)
 
     mu = np.mean(A)
     std = np.mean(np.sqrt(A * (1 - A) * (phi + N) / ((phi + 1) * N)))
-    # mu.item(), std.item()
 
     Dx = A
     alpha = Dx * phi
     beta = (1 - Dx) * phi
 
-    # pdf = sp_betabinom(N, alpha, beta)
-    # 1000x faster approximation for ppf
+    # 1000x faster approximation for sp_betabinom(N, alpha, beta)
     pdf_approx = sp_betabinom(N, alpha.mean(), beta.mean())
 
-    # pdf_beta = sp_beta(alpha, beta)
+    fit_result[f"{prefix}D"] = mu.item()
+    fit_result[f"{prefix}D_std"] = std.item()
+    fit_result[f"{prefix}significance"] = mu.item() / std.item()
+    fit_result[f"{prefix}D_median"] = np.median(A)
 
-    # pdf.pmf(0).mean()
-    # pdf_approx.pmf(0)  # 3 times faster for pmf
-
-    prefix = "Bayesian_"
-    fit_result[f"{prefix}D_max"] = mu.item()
-    fit_result[f"{prefix}D_max_std"] = std.item()
-    fit_result[f"{prefix}D_max_median"] = np.median(A)
-    # fit_result[f"{prefix}D_max_median"] = pdf_approx.median().mean() / N
-
-    # fit_result[f"{prefix}prob_lt_5p_damage"] = pdf_beta.cdf(0.05).mean()
-    # # fit_result[f"{prefix}prob_lt_5p_damage_betabinom"] = pdf.cdf(0.05 * N).mean()
-    # fit_result[f"{prefix}prob_lt_2p_damage"] = pdf_beta.cdf(0.02).mean()
-    # # fit_result[f"{prefix}prob_lt_2p_damage_betabinom"] = pdf.cdf(0.02 * N).mean()
     # fit_result[f"{prefix}prob_lt_1p_damage"] = pdf_beta.cdf(0.01).mean()
     # # fit_result[f"{prefix}prob_lt_1p_damage_betabinom"] = pdf.cdf(0.01 * N).mean()
-    # fit_result[f"{prefix}prob_lt_0.1p_damage"] = pdf_beta.cdf(0.001).mean()
-    # # fit_result[f"{prefix}prob_lt_0.1p_damage_betabinom"] = pdf.cdf(0.001 * N).mean()
     # fit_result[f"{prefix}prob_zero_damage"] = pdf.cdf(0).mean()
 
-    # fit_result[f"{prefix}D_max_median"] = pdf.median().mean() / N
-
     for n_sigma in [1, 2, 3]:
         conf = get_n_sigma_probability(n_sigma)
-        fit_result[f"{prefix}D_max_confidence_interval_{n_sigma}_sigma_low"] = (
+        fit_result[f"{prefix}D_CI_{n_sigma}_sigma_low"] = (
             pdf_approx.ppf((1 - conf) / 2.0).mean() / N
         )
-        fit_result[f"{prefix}D_max_confidence_interval_{n_sigma}_sigma_high"] = (
+        fit_result[f"{prefix}D_CI_{n_sigma}_sigma_high"] = (
             pdf_approx.ppf((1 + conf) / 2.0).mean() / N
         )
 
-    fit_result[f"{prefix}D_max_confidence_interval_95_low"] = (
-        pdf_approx.ppf((1 - 0.95) / 2.0).mean() / N
-    )
-    fit_result[f"{prefix}D_max_confidence_interval_95_high"] = (
-        pdf_approx.ppf((1 + 0.95) / 2.0).mean() / N
-    )
-
-    # fit_result["Bayesian_D_max"] = D_max["mu"]
-    # fit_result["Bayesian_D_max_std"] = D_max["std"]
-    # fit_result["Bayesian_D_max_median"] = D_max["median"]
-    # fit_result["Bayesian_D_max_confidence_interval_1_sigma_low"] = D_max[
-    #     "confidence_interval_1_sigma_low"
-    # ]
-    # fit_result["Bayesian_D_max_confidence_interval_1_sigma_high"] = D_max[
-    #     "confidence_interval_1_sigma_high"
-    # ]
-    # fit_result["Bayesian_D_max_confidence_interval_95_low"] = D_max[
-    #     "confidence_interval_95_low"
-    # ]
-    # fit_result["Bayesian_D_max_confidence_interval_95_high"] = D_max[
-    #     "confidence_interval_95_high"
-    # ]
+    fit_result[f"{prefix}D_CI_95_low"] = pdf_approx.ppf((1 - 0.95) / 2.0).mean() / N
+    fit_result[f"{prefix}D_CI_95_high"] = pdf_approx.ppf((1 + 0.95) / 2.0).mean() / N
 
     return fit_result
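The closed-form `std` in `add_D_information` above is the standard deviation of the fraction k/N under a beta-binomial with mean `A` and concentration `phi`, averaged over the posterior samples. A minimal sketch of the identity behind that line (illustrative helper, not part of the diff):

    # Var(k/N) for k ~ BetaBinomial(alpha=A*phi, beta=(1-A)*phi, n=N) is
    # A * (1 - A) / N * (phi + N) / (phi + 1), so the per-sample std is:
    import numpy as np

    def betabinom_fraction_std(A, phi, N):
        return np.sqrt(A * (1 - A) * (phi + N) / ((phi + 1) * N))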
@@ -211,72 +127,23 @@ def add_D_max_information(fit_result, mcmc, data):
 
 
 @jit
-def _compute_log_likelihood_PMD(posterior_samples, data):
-    return log_likelihood(model_PMD, posterior_samples, **data)["obs"]
-
-
-@jit
-def _compute_log_likelihood_null(posterior_samples, data):
-    return log_likelihood(model_null, posterior_samples, **data)["obs"]
+def _compute_log_likelihood(posterior_samples, data):
+    return log_likelihood(numpyro_model, posterior_samples, **data)["obs"]
 
 
 def compute_log_likelihood(mcmc, data):
     posterior_samples = mcmc.get_samples()
-    if is_model_PMD(mcmc.sampler.model):
-        return _compute_log_likelihood_PMD(posterior_samples, data)
-    else:
-        return _compute_log_likelihood_null(posterior_samples, data)
-
-
-#%%
-
-
-def get_lppd_and_waic(mcmc, data):
-    d_results = {}
-    # get the log likehood for each (point, num_samples)
-    logprob = np.asarray(compute_log_likelihood(mcmc, data))
-    # lppd for each observation
-    lppd_i = logsumexp(logprob, 0) - np.log(logprob.shape[0])
-    d_results["lppd_i"] = lppd_i
-    # lppd
-    lppd = lppd_i.sum()
-    d_results["lppd"] = lppd
-    # waic penalty for each observation
-    pWAIC_i = np.var(logprob, 0)
-    d_results["pWAIC_i"] = pWAIC_i
-    # waic penalty # the effective number of parameters penalty
-    pWAIC = pWAIC_i.sum()
-    d_results["pWAIC"] = pWAIC
-    # waic for each observation
-    waic_i = -2 * (lppd_i - pWAIC_i)
-    d_results["waic_i"] = waic_i
-    # waic # prediction of out-of-sample deviance
-    waic = waic_i.sum()
-    d_results["waic"] = waic
-    # standard error of waic
-    # waic_vec = -2 * (lppd_i - pWAIC_i)
-    # waic_uncertainty = jnp.sqrt(logprob.shape[1] * jnp.var(waic_vec))
-    return d_results
+    return _compute_log_likelihood(posterior_samples, data)
 
 
 #%%
 
 
-# def get_mean_of_variable(mcmc, variable, axis=0):
-#     values = mcmc.get_samples()[variable]
-#     return np.mean(values, axis=axis).item()
-
-
-# def get_std_of_variable(mcmc, variable, axis=0):
-#     values = mcmc.get_samples()[variable]
-#     return np.std(values, axis=axis).item()
-
-
 def add_summary_of_variable(
     fit_result,
     mcmc,
     variable,
-    prefix="Bayesian_",
+    prefix="",
     axis=0,
 ):
     values = np.array(mcmc.get_samples()[variable])
@@ -288,12 +155,8 @@ def add_summary_of_variable(
     fit_result[f"{s}"] = np.mean(values, axis=axis)
     fit_result[f"{s}_std"] = np.std(values, axis=axis)
     fit_result[f"{s}_median"] = np.median(values, axis=axis)
-    fit_result[f"{s}_confidence_interval_1_sigma_low"] = np.quantile(
-        values, q_low, axis=axis
-    )
-    fit_result[f"{s}_confidence_interval_1_sigma_high"] = np.quantile(
-        values, q_high, axis=axis
-    )
+    fit_result[f"{s}_CI_1_sigma_low"] = np.quantile(values, q_low, axis=axis)
+    fit_result[f"{s}_CI_1_sigma_high"] = np.quantile(values, q_high, axis=axis)
 
 
 def compute_rho_Ac(mcmc):
@@ -302,48 +165,10 @@ def compute_rho_Ac(mcmc):
     return np.corrcoef(A, c)[0, 1]
 
 
-@njit
-def compute_dse(waic_i_x, waic_i_y):
-    N = len(waic_i_x)
-    return np.sqrt(N * np.var(waic_i_x - waic_i_y))
-
-
-@njit
-def compute_z_from_waic_is(waic_i_x, waic_i_y):
-    dse = compute_dse(waic_i_x, waic_i_y)
-    d_waic = waic_i_y.sum() - waic_i_x.sum()
-    z = d_waic / dse
-    return z
-
-
-def compute_z(d_results_PMD, d_results_null):
-    z = compute_z_from_waic_is(d_results_PMD["waic_i"], d_results_null["waic_i"])
-    return z
-
-
-@njit
-def compute_z_jackknife_error_from_waic_is(waic_i_x, waic_i_y):
-    N = len(waic_i_x)
-    all_ids = np.arange(N)
-    zs = np.empty(N)
-    for i in range(N):
-        mask = all_ids != i
-        zs[i] = compute_z_from_waic_is(waic_i_x[mask], waic_i_y[mask])
-    z_jack_std = np.std(zs)
-    return z_jack_std
-
-
-def compute_z_jackknife_error(d_results_PMD, d_results_null):
-    return compute_z_jackknife_error_from_waic_is(
-        d_results_PMD["waic_i"],
-        d_results_null["waic_i"],
-    )
-
-
 #%%
 
 
-def init_mcmc(model, **kwargs):
+def _init_mcmc(model, **kwargs):
 
     mcmc_kwargs = dict(
         progress_bar=False,
@@ -357,25 +182,12 @@ def init_mcmc(model, **kwargs):
     return MCMC(NUTS(model), jit_model_args=True, **mcmc_kwargs, **kwargs)
 
 
-def init_mcmc_PMD(**kwargs):
-    mcmc_PMD = init_mcmc(model_PMD, **kwargs)
-    return mcmc_PMD
-
-
-def init_mcmc_null(**kwargs):
-    mcmc_null = init_mcmc(model_null, **kwargs)
-    return mcmc_null
-
-
-def init_mcmcs(config, **kwargs):
+def init_mcmc(config, **kwargs):
     if config["bayesian"]:
-        mcmc_PMD = init_mcmc_PMD(**kwargs)
-        # mcmc_null = init_mcmc_null(**kwargs)
+        mcmc = _init_mcmc(numpyro_model, **kwargs)
     else:
-        mcmc_PMD = None
-        # mcmc_null = None
-    return mcmc_PMD
-    # return mcmc_PMD, mcmc_null
+        mcmc = None
+    return mcmc
 
 
 def fit_mcmc(mcmc, data, seed=0):
@@ -390,43 +202,31 @@ def use_last_state_as_warmup_state(mcmc):
 
 
 def add_Bayesian_fit_result(
     fit_result,
     data,
-    mcmc_PMD,
-    # mcmc_null,
+    mcmc,
 ):
-    # d_results_PMD = get_lppd_and_waic(mcmc_PMD, data)
-    # d_results_null = get_lppd_and_waic(mcmc_null, data)
-
-    add_D_max_information(fit_result, mcmc_PMD, data)
+    add_D_information(fit_result, mcmc, data)
 
-    add_summary_of_variable(fit_result, mcmc_PMD, "A")
-    add_summary_of_variable(fit_result, mcmc_PMD, "q")
-    add_summary_of_variable(fit_result, mcmc_PMD, "c")
-    add_summary_of_variable(fit_result, mcmc_PMD, "phi")
+    add_summary_of_variable(fit_result, mcmc, "A")
+    add_summary_of_variable(fit_result, mcmc, "q")
+    add_summary_of_variable(fit_result, mcmc, "c")
+    add_summary_of_variable(fit_result, mcmc, "phi")
 
-    fit_result["Bayesian_rho_Ac"] = compute_rho_Ac(mcmc_PMD)
-
-    # z = compute_z(d_results_PMD, d_results_null)
-    # fit_result["Bayesian_z"] = z
+    fit_result["rho_Ac"] = compute_rho_Ac(mcmc)
 
 
 def make_fits(
     fit_result,
     data,
-    mcmc_PMD,
-    # mcmc_null,
+    mcmc,
 ):
-    fit_mcmc(mcmc_PMD, data)
-    # fit_mcmc(mcmc_null, data)  # do not use non-PMD model later on
+    fit_mcmc(mcmc, data)
 
     add_Bayesian_fit_result(
         fit_result,
         data,
-        mcmc_PMD,
-        # mcmc_null,
+        mcmc,
     )
 
-    # mcmc_PMD.print_summary(prob=0.68)
-    # mcmc_null.print_summary(prob=0.68)
+    # mcmc.print_summary(prob=0.68)
 
     # if False:
-    # use_last_state_as_warmup_state(mcmc_PMD)
-    # use_last_state_as_warmup_state(mcmc_null)
+    # use_last_state_as_warmup_state(mcmc)
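With the null model gone, the Bayesian entry point reduces to a single MCMC object. A usage sketch of the renamed API, using the same `data` layout (`x`, `N`, `k` arrays) assumed throughout `fit/`; the toy numbers and minimal config are assumptions, not code from the repo:

    import numpy as np
    from metaDMG.fit import bayesian

    config = {"bayesian": True}  # minimal config, assumed
    data = {
        "x": np.arange(1, 16),                     # positions 1..15
        "N": np.full(15, 100),                     # reads per position
        "k": np.linspace(25, 5, 15).astype(int),   # mismatch counts
    }

    mcmc = bayesian.init_mcmc(config)  # None if config["bayesian"] is False
    fit_result = {}
    if mcmc is not None:
        bayesian.make_fits(fit_result, data, mcmc)
        print(fit_result["D"], fit_result["D_std"], fit_result["significance"])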
diff --git a/src/metaDMG/fit/fits.py b/src/metaDMG/fit/fits.py
index 0c4b62f..1c86f7c 100644
--- a/src/metaDMG/fit/fits.py
+++ b/src/metaDMG/fit/fits.py
@@ -135,30 +135,13 @@ def add_count_information(fit_result, config, group, data):
 
 
 #%%
 
 
-# def timer_fit_MAP(config, data):
-#     # data = group_to_numpyro_data(config, group)
-#     # %timeit timer_fit_MAP(config, data)
-#     fit_result = {}
-#     with warnings.catch_warnings():
-#         warnings.filterwarnings("ignore")
-#         fit_all, fit_forward, fit_reverse = frequentist.make_fits(fit_result, data)
-
-
-# def timer_fit_bayesian(config, data, mcmc_PMD, mcmc_null):
-#     # data = group_to_numpyro_data(config, group)
-#     # %timeit timer_fit_bayesian(config, data, mcmc_PMD, mcmc_null)
-#     fit_result = {}
-#     bayesian.make_fits(fit_result, data, mcmc_PMD, mcmc_null)
-
-
 #%%
 
 
 def fit_single_group(
     config,
     group,
-    mcmc_PMD=None,
-    # mcmc_null=None,
+    mcmc=None,
 ):
 
     fit_result = {}
@@ -177,26 +160,12 @@ def fit_single_group(
         logger.warning(s)
         return None
 
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore")
-
-        frequentist.make_fits(
-            config,
-            fit_result,
-            data,
-            sample,
-            tax_id,
-        )  # fit_all, fit_forward, fit_reverse
-
-    add_count_information(fit_result, config, group, data)
-
-    if mcmc_PMD is not None:  # and mcmc_null is not None:
+    if mcmc is not None:
        try:
             bayesian.make_fits(
                 fit_result,
                 data,
-                mcmc_PMD,
-                # mcmc_null,
+                mcmc,
             )
         except:
             from metaDMG.fit.serial import _setup_logger
@@ -206,14 +175,26 @@
             s += "Skipping the Bayesian fit."
             logger.warning(s)
 
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore")
+
+        frequentist.make_fits(
+            config,
+            fit_result,
+            data,
+            sample,
+            tax_id,
+        )  # fit
+
+    add_count_information(fit_result, config, group, data)
+
     return fit_result
 
 
 def compute_fits_seriel(config, df_mismatches, with_progressbar=False):
 
     # Do not initialise MCMC if config["bayesian"] is False
-    # mcmc_PMD, mcmc_null = bayesian.init_mcmcs(config)
-    mcmc_PMD = bayesian.init_mcmcs(config)
+    mcmc = bayesian.init_mcmc(config)
 
     groupby = get_groupby(df_mismatches)
 
@@ -230,7 +211,6 @@ def compute_fits_seriel(config, df_mismatches, with_progressbar=False):
         res = fit_single_group(
             config,
             group,
-            mcmc_PMD,
-            # mcmc_null,
+            mcmc,
         )
 
@@ -654,6 +635,7 @@
 
     df_fit_results = pd.merge(df_fit_results, df_stat_cut, on="tax_id")
 
+    prefix = "" if config["bayesian"] else "MAP_"
     cols_ordered = [
         "sample",
         "tax_id",
@@ -661,8 +643,8 @@
         "tax_rank",
         "N_reads",
         "N_alignments",
-        "D_max",
-        "significance",
+        f"{prefix}D",
+        f"{prefix}significance",
         "mean_L",
         "std_L",
         "mean_GC",
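The new `prefix` logic makes the column convention explicit: Bayesian estimates are written unprefixed, while frequentist (MAP) estimates always carry `MAP_`. A small illustration of the resulting column names (derived from the code above; the loop itself is only illustrative):

    for bayesian_enabled in (True, False):
        prefix = "" if bayesian_enabled else "MAP_"
        print([f"{prefix}D", f"{prefix}significance", "MAP_valid"])
    # True  -> ['D', 'significance', 'MAP_valid']
    # False -> ['MAP_D', 'MAP_significance', 'MAP_valid']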
diff --git a/src/metaDMG/fit/frequentist.py b/src/metaDMG/fit/frequentist.py
index b718ab7..469889e 100644
--- a/src/metaDMG/fit/frequentist.py
+++ b/src/metaDMG/fit/frequentist.py
@@ -22,7 +22,7 @@
 
 
 @njit
-def log_likelihood_PMD(A, q, c, phi, x, k, N):
+def compute_log_likelihood(A, q, c, phi, x, k, N):
     Dx = A * (1 - q) ** (np.abs(x) - 1) + c
     alpha = Dx * phi
     beta = (1 - Dx) * phi
@@ -30,7 +30,7 @@ def log_likelihood_PMD(A, q, c, phi, x, k, N):
 
 
 @njit
-def log_prior_PMD(A, q, c, phi):
+def compute_log_prior(A, q, c, phi):
     lp = (
         fit_utils.log_beta(A, *A_prior)
         + fit_utils.log_beta(q, *q_prior)
@@ -41,18 +41,24 @@ def log_prior_PMD(A, q, c, phi):
 
 
 @njit
-def log_posterior_PMD(A, q, c, phi, x, k, N):
-    log_likelihood = log_likelihood_PMD(A=A, q=q, c=c, phi=phi, x=x, k=k, N=N)
-    log_p = log_prior_PMD(A=A, q=q, c=c, phi=phi)
+def compute_log_posterior(A, q, c, phi, x, k, N):
+    log_likelihood = compute_log_likelihood(A=A, q=q, c=c, phi=phi, x=x, k=k, N=N)
+    log_p = compute_log_prior(A=A, q=q, c=c, phi=phi)
     return log_likelihood + log_p
 
 
 #%%
 
 
-class FrequentistPMD:
+class Frequentist:
     def __init__(
-        self, data, sample, tax_id, method="posterior", p0=None, verbose=False
+        self,
+        data,
+        sample,
+        tax_id,
+        method="posterior",
+        p0=None,
+        verbose=False,
     ):
         self.sample = sample
         self.tax_id = tax_id
@@ -76,7 +82,7 @@ def __str__(self):
         s = f"sample = {self.sample}, tax_id = {self.tax_id} \n"
-        s += f"A = {self.A:.3f}, q = {self.q:.3f},"
+        s += f"A = {self.A:.3f}, q = {self.q:.3f}, "
         s += f"c = {self.c:.5f}, phi = {self.phi:.1f} \n"
-        s += f"D_max = {self.D_max:.3f} +/- {self.D_max_std:.3f} \n"
+        s += f"D = {self.D:.3f} +/- {self.D_std:.3f} \n"
         s += f"significance = {self.significance:.3f} \n"
         s += f"rho_Ac = {self.rho_Ac:.3f} \n"
         s += f"log_likelihood = {self.log_likelihood:.3f} \n"
@@ -87,22 +93,22 @@ def __str__(self):
 
     def __call__(self, A, q, c, phi):
         if self.method == "likelihood":
-            return self.log_likelihood_PMD(
+            return self.compute_log_likelihood(
                 A=A,
                 q=q,
                 c=c,
                 phi=phi,
             )
         elif self.method == "posterior":
-            return self.log_posterior_PMD(
+            return self.compute_log_posterior(
                 A=A,
                 q=q,
                 c=c,
                 phi=phi,
             )
 
-    def log_likelihood_PMD(self, A, q, c, phi):
-        return log_likelihood_PMD(
+    def compute_log_likelihood(self, A, q, c, phi):
+        return compute_log_likelihood(
             A=A,
             q=q,
             c=c,
@@ -112,8 +118,8 @@ def log_likelihood_PMD(self, A, q, c, phi):
             N=self.N,
         )
 
-    def log_posterior_PMD(self, A, q, c, phi):
-        return log_posterior_PMD(
+    def compute_log_posterior(self, A, q, c, phi):
+        return compute_log_posterior(
             A=A,
             q=q,
             c=c,
@@ -130,19 +136,19 @@ def log_posterior_PMD(self, A, q, c, phi):
         self.p0 = p0
 
         self.param_grid = {
-            "A": sp_beta(*A_prior),  # mean = 0.2, shape = 4
-            "q": sp_beta(*q_prior),  # mean = 0.2, shape = 4
-            "c": sp_beta(*c_prior),  # mean = 0.1, shape = 10
+            "A": sp_beta(*A_prior),
+            "q": sp_beta(*q_prior),
+            "c": sp_beta(*c_prior),
             "phi": sp_exponential(*phi_prior),
         }
 
     def _setup_minuit(self, m=None):
 
         if self.method == "likelihood":
-            f = self.log_likelihood_PMD
+            f = self.compute_log_likelihood
 
         elif self.method == "posterior":
-            f = self.log_posterior_PMD
+            f = self.compute_log_posterior
 
         if m is None:
             self.m = Minuit(f, **self.p0)
@@ -231,16 +237,16 @@
         self.valid = self.m.valid
 
         if self.valid:
-            self.D_max, self.D_max_std, self.significance = self._get_D_max()
+            self.D, self.D_std, self.significance = self._get_D()
         else:
-            self.D_max, self.D_max_std, self.significance = np.nan, np.nan, np.nan
+            self.D, self.D_std, self.significance = np.nan, np.nan, np.nan
 
         return self
 
     @property
     def log_likelihood(self):
         if self.valid:
-            return self.log_likelihood_PMD(*self.m.values)
+            return self.compute_log_likelihood(*self.m.values)
         else:
             return np.nan
 
@@ -303,7 +309,7 @@ def phi(self):
     def phi_std(self):
         return self.errors["phi"]
 
-    def _get_D_max(self):
+    def _get_D(self):
 
         A = self.A
         c = self.c
@@ -316,22 +322,6 @@
         significance = mu / std
         return mu, std, significance
 
-        # Dx_x1 = A
-        # alpha = Dx_x1 * phi
-        # beta = (1 - Dx_x1) * phi
-
-        # dist = sp_betabinom(n=self.N[0], a=alpha, b=beta)
-
-        # # mu = dist.mean() / fit.N[0] - c
-        # mu = A
-        # if self.N[0] != 0:
-        #     std = np.sqrt(dist.var()) / self.N[0]
-        # else:
-        #     std = np.nan
-
-        # self.D_max = mu  # A
-        # self.D_max_std = std
-
     @property
     def correlation(self):
         return self.m.covariance.correlation()
@@ -385,248 +375,6 @@ def chi2(self):
 
 
 #%%
 
 
-# @njit
-# def f_fit_null(c, phi, k, N):
-#     alpha = c * phi
-#     beta = (1 - c) * phi
-#     return -fit_utils.log_betabinom_null(k=k, N=N, alpha=alpha, beta=beta).sum()
-
-
-@njit
-def log_likelihood_null(c, phi, x, k, N):
-    alpha = c * phi
-    beta = (1 - c) * phi
-    return -fit_utils.log_betabinom_null(k=k, N=N, alpha=alpha, beta=beta).sum()
-
-
-@njit
-def log_prior_null(c, phi):
-    lp = fit_utils.log_beta(c, *c_prior) + fit_utils.log_exponential(phi, *phi_prior)
-    return -lp
-
-
-@njit
-def log_posterior_null(c, phi, x, k, N):
-    log_likelihood = log_likelihood_null(c=c, phi=phi, x=x, k=k, N=N)
-    log_p = log_prior_null(c=c, phi=phi)
-    return log_likelihood + log_p
-
-
-class FrequentistNull:
-    def __init__(self, data, sample, tax_id, method="posterior"):
-        self.sample = sample
-        self.tax_id = tax_id
-        self.x = data["x"]
-        self.k = data["k"]
-        self.N = data["N"]
-        self.method = method
-        self._setup_minuit()
-
-    def __repr__(self):
-        s = f"Frequentist(data, method={self.method}). \n\n"
-        s += self.__str__()
-        return s
-
-    def __str__(self):
-        s = f"sample = {self.sample}, tax_id = {self.tax_id} \n"
-        s += f"c = {self.c:.5f}, phi = {self.phi:.1f} \n"
-        s += f"log_likelihood = {self.log_likelihood:.3f} \n"
-        return s
-
-    def __call__(self, A, q, c, phi):
-        if self.method == "likelihood":
-            return self.log_likelihood_null(A=A, q=q, c=c, phi=phi)
-        elif self.method == "posterior":
-            return self.log_posterior_null(A=A, q=q, c=c, phi=phi)
-
-    def log_likelihood_null(self, c, phi):
-        return log_likelihood_null(c=c, phi=phi, x=self.x, k=self.k, N=self.N)
-
-    def log_posterior_null(self, c, phi):
-        return log_posterior_null(c=c, phi=phi, x=self.x, k=self.k, N=self.N)
-
-    # def __call__(self, c, phi):
-    #     return f_fit_null(c, phi, self.k, self.N)
-
-    def _setup_minuit(self):
-
-        if self.method == "likelihood":
-            f = self.log_likelihood_null
-
-        elif self.method == "posterior":
-            f = self.log_posterior_null
-
-        self.m = Minuit(f, c=0.1, phi=100)
-
-        if self.method == "likelihood":
-            self.m.limits["c"] = (0, 1)
-
-        elif self.method == "posterior":
-            eps = 1e-10
-            self.m.limits["c"] = (0 + eps, 1 - eps)
-
-        self.m.limits["phi"] = (2, None)
-        self.m.errordef = Minuit.LIKELIHOOD
-
-    def fit(self):
-        self.m.migrad()
-        return self
-
-    def migrad(self):
-        self.m.migrad()
-        return self
-
-    def minos(self):
-        self.m.minos()
-        return self
-
-    @property
-    def log_likelihood(self):
-        return self.log_likelihood_null(*self.m.values)
-
-    @property
-    def c(self):
-        return self.values["c"]
-
-    @property
-    def phi(self):
-        return self.values["phi"]
-
-    @property
-    def values(self):
-        return self.m.values.to_dict()
-
-    @property
-    def errors(self):
-        return self.m.errors.to_dict()
-
-
-#%%
-
-
-class Frequentist:
-    def __init__(self, data, sample, tax_id, method="posterior", p0=None):
-        self.PMD = FrequentistPMD(data, sample, tax_id, method=method, p0=p0).fit()
-        self.null = FrequentistNull(data, sample, tax_id, method=method).fit()
-        self.lambda_LR = fit_utils.compute_likelihood_ratio(
-            self.PMD,
-            self.null,
-            only_LR=True,
-        )
-
-        self.valid = self.PMD.valid
-
-        self.data = data
-        self.sample = sample
-        self.tax_id = tax_id
-        self.x = data["x"]
-        self.k = data["k"]
-        self.N = data["N"]
-        self.method = method
-
-    def __repr__(self):
-        s = f"Frequentist(data, method={self.method}). \n\n"
-        s += self.__str__()
-        return s
-
-    def __str__(self):
-        s = f"sample = {self.sample}, tax_id = {self.tax_id} \n"
-        s += f"A = {self.A:.3f}, q = {self.q:.3f}, "
-        s += f"c = {self.c:.5f}, phi = {self.phi:.1f} \n"
-        s += f"D_max = {self.D_max:.3f} +/- {self.D_max_std:.3f}, significance = {self.significance:.3f} \n"
-        s += f"rho_Ac = {self.rho_Ac:.3f} \n"
-        s += f"log_likelihood_PMD = {self.PMD.log_likelihood:.3f} \n"
-        s += f"log_likelihood_null = {self.null.log_likelihood:.3f} \n"
-        s += (
-            f"lambda_LR = {self.lambda_LR:.3f} \n"
-            # f"lambda_LR as prob = {self.lambda_LR_P:.4%}, "
-            # f"lambda_LR as z = {self.lambda_LR_z:.3f} \n"
-        )
-        s += f"valid = {self.valid}"
-        return s
-
-    @property
-    def D_max(self):
-        return self.PMD.D_max
-
-    @property
-    def D_max_std(self):
-        return self.PMD.D_max_std
-
-    @property
-    def A(self):
-        return self.PMD.A
-
-    @property
-    def A_std(self):
-        return self.PMD.A_std
-
-    @property
-    def q(self):
-        return self.PMD.q
-
-    @property
-    def q_std(self):
-        return self.PMD.q_std
-
-    @property
-    def c(self):
-        return self.PMD.c
-
-    @property
-    def c_std(self):
-        return self.PMD.c_std
-
-    @property
-    def phi(self):
-        return self.PMD.phi
-
-    @property
-    def phi_std(self):
-        return self.PMD.phi_std
-
-    @property
-    def rho_Ac(self):
-        return self.PMD.rho_Ac
-
-    @property
-    def chi2(self):
-        return self.PMD.chi2
-
-    @property
-    def PMD_values(self):
-        return self.PMD.values
-
-    @property
-    def significance(self):
-        return self.PMD.significance
-
-
-#%%
-
-
-def compute_LR(fit1, fit2):
-    return -2 * (fit1.log_likelihood - fit2.log_likelihood)
-
-
-def compute_LR_All(fit_all):
-    return compute_LR(fit_all.PMD, fit_all.null)
-
-
-def compute_LR_ForRev(fit_forward, fit_reverse):
-    LR_forward = compute_LR(fit_forward.PMD, fit_forward.null)
-    LR_reverse = compute_LR(fit_reverse.PMD, fit_reverse.null)
-    return LR_forward + LR_reverse
-
-
-def compute_LR_ForRev_All(fit_all, fit_forward, fit_reverse):
-    log_lik_ForRev = fit_forward.PMD.log_likelihood + fit_reverse.PMD.log_likelihood
-    return -2 * (log_lik_ForRev - fit_all.PMD.log_likelihood)
-
-
-#%%
-
-
 def make_fits(
     config,
     fit_result,
     data,
@@ -638,50 +386,24 @@ def make_fits(
 ):
 
     np.random.seed(42)
 
-    forward_only = config["forward_only"] if forward_only is None else forward_only
+    if forward_only is None:
+        forward_only = config["forward_only"]
 
     if forward_only:
         data = {key: val[data["x"] > 0] for key, val in data.items()}
 
-        # fit_forward = Frequentist(
-        #     data_forward,
-        #     sample,
-        #     tax_id,
-        #     method=method,
-        # )
-
-    # else:
-    #     fit_all = Frequentist(data, sample, tax_id, method=method)
 
     fit = Frequentist(
         data,
         sample,
         tax_id,
         method=method,
-    )
-
-    # data_forward = {key: val[data["x"] > 0] for key, val in data.items()}
-    # data_reverse = {key: val[data["x"] < 0] for key, val in data.items()}
-
-    # fit_forward = Frequentist(
-    #     data_forward,
-    #     sample,
-    #     tax_id,
-    #     method=method,
-    #     p0=fit_all.PMD_values,
-    # )
-    # fit_reverse = Frequentist(
-    #     data_reverse,
-    #     sample,
-    #     tax_id,
-    #     method=method,
-    #     p0=fit_all.PMD_values,
-    # )
+    ).fit()
 
     vars_to_keep = [
-        "D_max",
-        "D_max_std",
+        "D",
+        "D_std",
         "significance",
-        # "lambda_LR",
         "q",
         "q_std",
         "phi",
         "phi_std",
         "A",
         "A_std",
         "c",
         "c_std",
         "rho_Ac",
+        "valid",
     ]
 
     for var in vars_to_keep:
-        fit_result[f"{var}"] = getattr(fit, var)
+        fit_result[f"MAP_{var}"] = getattr(fit, var)
 
     return fit
-
-
-    # if forward_only:
-
-    # duplicate entries for forward only
-
-    # for var in vars_to_keep:
-    #     fit_result[f"{var}"] = getattr(fit_forward, var)
-
-    # for var in vars_to_keep:
-    #     fit_result[f"forward_{var}"] = getattr(fit_forward, var)
-
-    # for var in vars_to_keep:
-    #     fit_result[f"reverse_{var}"] = np.nan
-    #     fit_result[f"reverse_valid"] = False
-
-    # fit_result["asymmetry"] = np.nan
-    # fit_result["LR_All"] = compute_LR_All(fit_forward)
-    # fit_result["LR_ForRev"] = np.nan
-    # fit_result["LR_ForRev_All"] = np.nan
-
-    # fit_result["chi2_all"] = fit_forward.chi2
-    # fit_result["chi2_forward"] = fit_result["chi2_all"]
-    # fit_result["chi2_reverse"] = np.nan
-    # fit_result["chi2_ForRev"] = np.nan
-    # return fit_forward
-
-    # for var in vars_to_keep:
-    #     fit_result[f"{var}"] = getattr(fit_all, var)
-
-    # numerator = fit_forward.D_max - fit_reverse.D_max
-    # denominator = np.sqrt(fit_forward.D_max_std**2 + fit_reverse.D_max_std**2)
-    # fit_result["asymmetry"] = np.abs(numerator) / denominator
-
-    # for var in vars_to_keep:
-    #     fit_result[f"forward_{var}"] = getattr(fit_forward, var)
-
-    # for var in vars_to_keep:
-    #     fit_result[f"reverse_{var}"] = getattr(fit_reverse, var)
-
-    # fit_result["LR_All"] = compute_LR_All(fit_all)
-    # fit_result["LR_ForRev"] = compute_LR_ForRev(fit_forward, fit_reverse)
-    # fit_result["LR_ForRev_All"] = compute_LR_ForRev_All(
-    #     fit_all, fit_forward, fit_reverse
-    # )
-
-    # fit_result["chi2_all"] = fit_all.chi2
-    # fit_result["chi2_forward"] = fit_forward.chi2
-    # fit_result["chi2_reverse"] = fit_reverse.chi2
-    # fit_result["chi2_ForRev"] = fit_forward.chi2 + fit_reverse.chi2
-
-    # return fit_all  # , fit_forward, fit_reverse
-
-
-# def make_forward_fits(fit_result, data, sample, tax_id):
-#     np.random.seed(42)
-
-#     fit_all = Frequentist(data, sample, tax_id, method="posterior")
-
-#     vars_to_keep = [
-#         "D_max",
-#         "D_max_std",
-#         "significance",
-#         "lambda_LR",
-#         "q",
-#         "q_std",
-#         "phi",
-#         "phi_std",
-#         "A",
-#         "A_std",
-#         "c",
-#         "c_std",
-#         "rho_Ac",
-#         # "lambda_LR_P",
-#         # "lambda_LR_z",
-#         "valid",
-#     ]
-
-#     for var in vars_to_keep:
-#         fit_result[f"{var}"] = getattr(fit_all, var)
-
-#     fit_result["asymmetry"] = np.nan
-#     fit_result["LR_All"] = compute_LR_All(fit_all)
-#     fit_result["chi2_all"] = fit_all.chi2
-
-#     return fit_all
-
-
-# def make_fits(config, fit_result, data, sample, tax_id):
-#     if config["forward_only"]:
-#         return make_forward_fits(fit_result, data, sample, tax_id)
-#     else:
-#         return make_forward_reverse_fits(fit_result, data, sample, tax_id)
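Note that the surviving `Frequentist` class (the former `FrequentistPMD`) no longer fits itself on construction the way the removed wrapper did, hence the explicit `.fit()` at the call site above. The `significance` it reports is simply the damage estimate over its uncertainty, a z-like score; a worked one-liner with made-up numbers:

    D, D_std = 0.12, 0.03     # hypothetical fit output
    significance = D / D_std  # 4.0: damage is ~4 sigma away from zero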
diff --git a/src/metaDMG/fit/results.py b/src/metaDMG/fit/results.py
index 7264792..03eb275 100644
--- a/src/metaDMG/fit/results.py
+++ b/src/metaDMG/fit/results.py
@@ -107,6 +107,7 @@ def merge(
     df_mismatches_wide = compute_df_mismatches_wide(df_mismatches)
     df_results = pd.merge(df_fit_results, df_mismatches_wide, on=["tax_id"])
 
+    prefix = "" if config["bayesian"] else "MAP_"
     columns_order = [
         "sample",
         "tax_id",
@@ -115,16 +116,16 @@ def merge(
         "N_reads",
         "N_alignments",
         #
-        "D_max",
-        "significance",
+        f"{prefix}D",
+        f"{prefix}significance",
         "mean_L",
         "mean_GC",
-        "q",
-        "A",
-        "c",
-        "phi",
-        "rho_Ac",
-        "valid",
+        f"{prefix}A",
+        f"{prefix}q",
+        f"{prefix}phi",
+        f"{prefix}c",
+        f"{prefix}rho_Ac",
+        "MAP_valid",
         #
         "asymmetry",
     ]
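The changed column check in `serial.py` below needs a Bayesian-only marker: every MAP column name (`MAP_D`, `MAP_D_std`, ...) contains the substring `"D"`, so testing for `"D"` would also fire on MAP-only results. The credible-interval columns (`D_CI_...`) are written only by the Bayesian fit, which is why the check uses `"D_CI"`. A sketch of the assumed semantics of `dataframe_columns_contains` (its real definition is not part of this diff):

    def dataframe_columns_contains(df, substring):
        return any(substring in column for column in df.columns)

    # "D"    -> also matches "MAP_D": cannot tell the two fits apart
    # "D_CI" -> matches only the Bayesian credible-interval columns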
diff --git a/src/metaDMG/fit/serial.py b/src/metaDMG/fit/serial.py
index 0bf437e..057aab4 100644
--- a/src/metaDMG/fit/serial.py
+++ b/src/metaDMG/fit/serial.py
@@ -427,7 +427,7 @@ def get_df_fit_results(
         return df_fit_results
 
     # if df_fit_results has already been run with Bayesian, return this
-    if dataframe_columns_contains(df_fit_results, "Bayesian"):
+    if dataframe_columns_contains(df_fit_results, "D_CI"):
         logger.info(f"Loading df_fit_results (Bayesian).")
         return df_fit_results
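For reference, the n-sigma coverage used for the credible intervals is the central probability mass of a standard normal, which is presumably what `get_n_sigma_probability` in `bayesian.py` computes (sketch assuming a scipy-based implementation):

    from scipy.stats import norm

    def get_n_sigma_probability(n_sigma):
        return norm.cdf(n_sigma) - norm.cdf(-n_sigma)

    print(get_n_sigma_probability(1))  # ~0.6827, i.e. CONF_1_SIGMA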