-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Normal Distribution and AIC in tidy_distribution_comparison #227
Comments
Fixes section where combining distributions for continuous and discrete. New Function: #' Compare Empirical Data to Distributions
#'
#' @family Empirical
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details The purpose of this function is to take some data set provided and
#' to try to find a distribution that may fit the best. A parameter of
#' `.distribution_type` must be set to either `continuous` or `discrete` in order
#' for this the function to try the appropriate types of distributions.
#'
#' The following distributions are used:
#'
#' Continuous:
#' - tidy_beta
#' - tidy_cauchy
#' - tidy_exponential
#' - tidy_gamma
#' - tidy_logistic
#' - tidy_lognormal
#' - tidy_normal
#' - tidy_pareto
#' - tidy_uniform
#' - tidy_weibull
#'
#' Discrete:
#' - tidy_binomial
#' - tidy_geometric
#' - tidy_hypergeometric
#' - tidy_poisson
#'
#'
#' @description Compare some empirical data set against different distributions
#' to help find the distribution that could be the best fit.
#'
#' @param .x The data set being passed to the function
#' @param .distribution_type What kind of data is it, can be one of `continuous`
#' or `discrete`
#' @param .print_aic This is a boolean that is defaulted to TRUE. This will print
#' out the `aic_tbl`. When set to FALSE it will print out the `total_deviance_tbl`
#'
#' @examples
#' xc <- mtcars$mpg
#' tidy_distribution_comparison(xc, "continuous")
#'
#' xd <- trunc(xc)
#' tidy_distribution_comparison(xd, "discrete")
#'
#' @return
#' An invisible list object. A tibble is printed.
#'
#' @export
#'
tidy_distribution_comparison <- function(.x, .distribution_type = "continuous",
.print_aic = TRUE){
# Tidyeval ----
x_term <- as.numeric(.x)
n <- length(x_term)
dist_type <- tolower(as.character(.distribution_type))
print_aic <- as.logical(.print_aic)
if (!dist_type %in% c("continuous","discrete")){
rlang::abort(
message = "The '.distribution_type' parameter must be either 'continuous'
or 'discrete'.",
use_cli_format = TRUE
)
}
if (!is.logical(print_aic)){
rlang::abort(
message = "'.print_aic' must be either TRUE or FALSE."
)
}
# Get parameter estimates for distributions
if (dist_type == "continuous"){
b <- try(util_beta_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, shape1, shape2) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(b, "try-error")){
tb <- tidy_beta(.n = n, .shape1 = round(b[[2]], 2), .shape2 = round(b[[3]], 2))
}
c <- try(util_cauchy_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, location, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(c, "try-error")){
tc <- tidy_cauchy(.n = n, .location = round(c[[2]], 2), .scale = round(c[[3]], 2))
}
e <- try(util_exponential_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, rate) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(e, "try-error")){
te <- tidy_exponential(.n = n, .rate = round(e[[2]], 2))
}
g <- try(util_gamma_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(g, "try-error")){
tg <- tidy_gamma(.n = n, .shape = round(g[[2]], 2), .scale = round(g[[3]], 2))
}
l <- try(util_logistic_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, location, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(l, "try-error")){
tl <- tidy_logistic(.n = n, .location = round(l[[2]], 2), .scale = round(l[[3]], 2))
}
ln <- try(util_lognormal_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, mean_log, sd_log) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(ln, "try-error")){
tln <- tidy_lognormal(.n = n, .meanlog = round(ln[[2]], 2), .sdlog = round(ln[[3]], 2))
}
p <- try(util_pareto_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "MLE") %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(p, "try-error")){
tp <- tidy_pareto(.n = n, .shape = round(p[[2]], 2), .scale = round(p[[3]], 2))
}
u <- try(util_uniform_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, min_est, max_est) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(u, "try-error")){
tu <- tidy_uniform(.n = n, .min = round(u[[2]], 2), .max = round(u[[3]], 2))
}
w <- try(util_weibull_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(w, "try-error")){
tw <- tidy_weibull(.n = n, .shape = round(w[[2]], 2), .scale = round(w[[3]], 2))
}
nn <- try(util_normal_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME_MLE") %>%
dplyr::select(dist_type, mu, stan_dev) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(n, "try-error")){
tn <- tidy_normal(.n = n, .mean = round(nn[[2]], 2), .sd = round(nn[[3]], 2))
}
comp_tbl <- tidy_combine_distributions(
tidy_empirical(x_term, .distribution_type = dist_type),
if (exists("tb") && nrow(tb) > 0){tb},
if (exists("tc") && nrow(tc) > 0){tc},
if (exists("te") && nrow(te) > 0){te},
if (exists("tg") && nrow(tg) > 0){tg},
if (exists("tl") && nrow(tl) > 0){tl},
if (exists("tln") && nrow(tln) > 0){tln},
if (exists("tp") && nrow(tp) > 0){tp},
if (exists("tu") && nrow(tu) > 0){tu},
if (exists("tw") && nrow(tw) > 0){tw},
if (exists("tn") && nrow(tn) > 0){tn}
)
} else {
bn <- try(util_binomial_param_estimate(trunc(tidy_scale_zero_one_vec(x_term)))$parameter_tbl %>%
dplyr::select(dist_type, size, prob) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(bn, "try-error")){
tb <- tidy_binomial(.n = n, .size = round(bn[[2]], 2), .prob = round(bn[[3]], 2))
}
ge <- try(util_geometric_param_estimate(trunc(x_term))$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, shape) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(ge, "try-error")){
tg <- tidy_geometric(.n = n, .prob = round(ge[[2]], 2))
}
h <- try(util_hypergeometric_param_estimate(.x = trunc(x_term), .total = n, .k = n)$parameter_tbl %>%
tidyr::drop_na() %>%
dplyr::slice(1) %>%
dplyr::select(dist_type, m, total) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(h, "try-error")){
th <- tidy_hypergeometric(
.n = n,
.m = trunc(h[[2]]),
.nn = n - trunc(h[[2]]),
.k = trunc(h[[2]])
)
}
po <- try(util_poisson_param_estimate(trunc(x_term))$parameter_tbl %>%
dplyr::select(dist_type, lambda) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(po, "try-error")){
tp <- tidy_poisson(.n = n, .lambda = round(po[[2]], 2))
}
comp_tbl <- tidy_combine_distributions(
tidy_empirical(.x = x_term, .distribution_type = dist_type),
if (exists("tb") && nrow(tb) > 0){tb},
if (exists("tg") && nrow(tg) > 0){tg},
if (exists("th") && nrow(th) > 0){th},
if (exists("tp") && nrow(tp) > 0){tp}
)
}
# Deviance calculations ----
deviance_tbl <- comp_tbl %>%
dplyr::select(dist_type, x, y) %>%
dplyr::group_by(dist_type, x) %>%
dplyr::mutate(y = stats::median(y)) %>%
dplyr::ungroup() %>%
dplyr::group_by(dist_type) %>%
dplyr::mutate(y = tidy_scale_zero_one_vec(y)) %>%
dplyr::ungroup() %>%
tidyr::pivot_wider(
id_cols = x,
names_from = dist_type,
values_from = y
) %>%
dplyr::select(x, Empirical, dplyr::everything()) %>%
dplyr::mutate(
dplyr::across(
.cols = -c(x, Empirical),
.fns = ~ Empirical - .
)
) %>%
tidyr::drop_na() %>%
tidyr::pivot_longer(
cols = -x
) %>%
dplyr::select(-x)
total_deviance_tbl <- deviance_tbl %>%
dplyr::filter(!name == "Empirical") %>%
dplyr::group_by(name) %>%
dplyr::summarise(total_deviance = sum(value)) %>%
dplyr::ungroup() %>%
dplyr::mutate(total_deviance = abs(total_deviance)) %>%
dplyr::arrange(abs(total_deviance)) %>%
dplyr::rename(dist_with_params = name,
abs_tot_deviance = total_deviance)
# AIC Data ----
emp_data_tbl <- comp_tbl %>%
dplyr::select(dist_type, x, dy) %>%
dplyr::filter(dist_type == "Empirical")
aic_tbl <- comp_tbl %>%
dplyr::filter(!dist_type == "Empirical") %>%
dplyr::select(dist_type, dy) %>%
tidyr::nest(data = dy) %>%
dplyr::mutate(
lm_model = purrr::map(
data,
function(df) lm(dy ~ emp_data_tbl$dy, data = df)
)
) %>%
dplyr::mutate(aic_value = purrr::map(lm_model, stats::AIC)) %>%
dplyr::mutate(aic_value = unlist(aic_value)) %>%
dplyr::mutate(abs_aic = abs(aic_value)) %>%
dplyr::arrange(abs_aic) %>%
dplyr::select(dist_type, aic_value, abs_aic)
multi_metric_tbl <- total_deviance_tbl %>%
dplyr::mutate(dist_with_params = as.factor(dist_with_params)) %>%
dplyr::inner_join(output$aic_tbl, by = c("dist_with_params"="dist_type"))
# Return ----
output <- list(
comparison_tbl = comp_tbl,
deviance_tbl = deviance_tbl,
total_deviance_tbl = total_deviance_tbl,
aic_tbl = aic_tbl,
multi_metric_tbl = multi_metric_tbl
)
# Attributes ----
attr(deviance_tbl, ".tibble_type") <- "deviance_comparison_tbl"
attr(total_deviance_tbl, ".tibble_type") <- "deviance_results_tbl"
attr(aic_tbl, ".tibble_type") <- "aic_tbl"
attr(comp_tbl, ".tibble_type") <- "comparison_tbl"
attr(output, ".x") <- x_term
attr(output, ".n") <- n
attr(output, ".print_aic") <- .print_aic
# Print to console
if (print_aic){
print(aic_tbl)
} else {
print(total_deviance_tbl)
}
return(invisible(output))
} Simple Example: x <- mtcars$mpg
# Continuous
> tidy_distribution_comparison(x)
For the beta distribution, its mean 'mu' should be 0 < mu < 1. The data will
therefore be scaled to enforce this.
# A tibble: 10 × 3
dist_type aic_value abs_aic
<fct> <dbl> <dbl>
1 Beta c(1.11, 1.58, 0) -1.98 1.98
2 Pareto c(10.4, 1.62) 82.7 82.7
3 Logistic c(20.09, 3.27) -149. 149.
4 Gaussian c(20.09, 5.93) -181. 181.
5 Cauchy c(19.2, 7.38) -193. 193.
6 Weibull c(3.58, 22.29) -207. 207.
7 Lognormal c(2.96, 0.29) -210. 210.
8 Uniform c(8.34, 31.84) -216. 216.
9 Exponential c(0.05) -252. 252.
10 Gamma c(11.47, 1.75) -268. 268.
# Discrete
> tidy_distribution_comparison(trunc(x), "discrete")
# A tibble: 4 × 3
dist_type aic_value abs_aic
<fct> <dbl> <dbl>
1 Binomial c(32, 0.03) -33.3 33.3
2 Hypergeometric c(21, 11, 21) -76.9 76.9
3 Poisson c(19.69) -137. 137.
4 Geometric c(0.05) -240. 240.
> output$multi_metric_tbl
# A tibble: 10 × 4
dist_with_params abs_tot_deviance aic_value abs_aic
<fct> <dbl> <dbl> <dbl>
1 Lognormal c(2.96, 0.29) 0.0804 -225. 225.
2 Beta c(1.11, 1.58, 0) 0.157 5.74 5.74
3 Logistic c(20.09, 3.27) 0.455 -152. 152.
4 Gamma c(11.47, 1.75) 2.73 -213. 213.
5 Uniform c(8.34, 31.84) 3.25 -215. 215.
6 Cauchy c(19.2, 7.38) 4.84 -220. 220.
7 Gaussian c(20.09, 5.93) 5.08 -161. 161.
8 Pareto c(10.4, 1.62) 5.24 95.8 95.8
9 Weibull c(3.58, 22.29) 5.56 -142. 142.
10 Exponential c(0.05) 5.69 -181. 181. |
I'm not sure if I'm using your function wrong, but I'm getting a different AIC from
Regards |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Please add Normal Distribution and AIC in tidy_distribution_comparison.
Per Linkedin post comment
Thanks
Ralph Asher
The text was updated successfully, but these errors were encountered: