-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make tidy_distribution_comparison()
more robust
#212
Comments
Function: #' Compare Empirical Data to Distributions
#'
#' @family Empirical
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details The purpose of this function is to take some data set provided and
#' to try to find a distribution that may fit the best. A parameter of
#' `.distribution_type` must be set to either `continuous` or `discrete` in order
#' for this the function to try the appropriate types of distributions.
#'
#' The following distributions are used:
#'
#' Continuous:
#' - tidy_beta
#' - tidy_cauchy
#' - tidy_exponential
#' - tidy_gamma
#' - tidy_logistic
#' - tidy_lognormal
#' - tidy_pareto
#' - tidy_uniform
#' - tidy_weibull
#'
#' Discrete:
#' - tidy_binomial
#' - tidy_geometric
#' - tidy_hypergeometric
#' - tidy_poisson
#'
#'
#' @description Compare some empirical data set against different distributions
#' to help find the distribution that could be the best fit.
#'
#' @param .x The data set being passed to the function
#' @param .distribution_type What kind of data is it, can be one of `continuous`
#' or `discrete`
#'
#' @examples
#' xc <- mtcars$mpg
#' tidy_distribution_comparison(xc, "continuous")
#'
#' xd <- trunc(xc)
#' tidy_distribution_comparison(xd, "discrete")
#'
#' @return
#' An invisible list object. A tibble is printed.
#'
#' @export
#'
tidy_distribution_comparison <- function(.x, .distribution_type = "continuous"){
# Tidyeval ----
x_term <- as.numeric(.x)
n <- length(x_term)
dist_type <- tolower(as.character(.distribution_type))
if (!dist_type %in% c("continuous","discrete")){
rlang::abort(
message = "The '.distribution_type' parameter must be either 'continuous'
or 'discrete'.",
use_cli_format = TRUE
)
}
# Get parameter estimates for distributions
if (dist_type == "continuous"){
b <- try(util_beta_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, shape1, shape2) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(b, "try-error")){
tb <- tidy_beta(.n = n, .shape1 = round(b[[2]], 2), .shape2 = round(b[[3]], 2))
}
c <- try(util_cauchy_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, location, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(c, "try-error")){
tc <- tidy_cauchy(.n = n, .location = round(c[[2]], 2), .scale = round(c[[3]], 2))
}
e <- try(util_exponential_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, rate) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(e, "try-error")){
te <- tidy_exponential(.n = n, .rate = round(e[[2]], 2))
}
g <- try(util_gamma_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(g, "try-error")){
tg <- tidy_gamma(.n = n, .shape = round(g[[2]], 2), .scale = round(g[[3]], 2))
}
l <- try(util_logistic_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, location, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(l, "try-error")){
tl <- tidy_logistic(.n = n, .location = round(l[[2]], 2), .scale = round(l[[3]], 2))
}
ln <- try(util_lognormal_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, mean_log, sd_log) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(ln, "try-error")){
tln <- tidy_lognormal(.n = n, .meanlog = round(ln[[2]], 2), .sdlog = round(ln[[3]], 2))
}
p <- try(util_pareto_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "MLE") %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(p, "try-error")){
tp <- tidy_pareto(.n = n, .shape = round(p[[2]], 2), .scale = round(p[[3]], 2))
}
u <- try(util_uniform_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, min_est, max_est) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(u, "try-error")){
tu <- tidy_uniform(.n = n, .min = round(u[[2]], 2), .max = round(u[[3]], 2))
}
w <- try(util_weibull_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(w, "try-error")){
tw <- tidy_weibull(.n = n, .shape = round(w[[2]], 2), .scale = round(w[[3]], 2))
}
comp_tbl <- tidy_combine_distributions(
tidy_empirical(x_term, .distribution_type = dist_type),
if (exists("tb") && nrow(tb) > 0){tb},
if (exists("tc") && nrow(tb) > 0){tc},
if (exists("te") && nrow(tb) > 0){te},
if (exists("tg") && nrow(tb) > 0){tg},
if (exists("tl") && nrow(tb) > 0){tl},
if (exists("tln") && nrow(tb) > 0){tln},
if (exists("tp") && nrow(tb) > 0){tp},
if (exists("tu") && nrow(tb) > 0){tu},
if (exists("tw") && nrow(tb) > 0){tw}
)
# comp_tbl <- tidy_combine_distributions(
# tidy_empirical(x_term, .distribution_type = dist_type),
# tidy_beta(.n = n, .shape1 = round(b[[2]], 2), .shape2 = round(b[[3]], 2)),
# tidy_cauchy(.n = n, .location = round(c[[2]], 2), .scale = round(c[[3]], 2)),
# tidy_exponential(.n = n, .rate = round(e[[2]], 2)),
# tidy_gamma(.n = n, .shape = round(g[[2]], 2), .scale = round(g[[3]], 2)),
# tidy_logistic(.n = n, .location = round(l[[2]], 2), .scale = round(l[[3]], 2)),
# tidy_lognormal(.n = n, .meanlog = round(ln[[2]], 2), .sdlog = round(ln[[3]], 2)),
# tidy_pareto(.n = n, .shape = round(p[[2]], 2), .scale = round(p[[3]], 2)),
# tidy_uniform(.n = n, .min = round(u[[2]], 2), .max = round(u[[3]], 2)),
# tidy_weibull(.n = n, .shape = round(w[[2]], 2), .scale = round(w[[3]], 2))
# )
} else {
bn <- try(util_binomial_param_estimate(trunc(tidy_scale_zero_one_vec(x_term)))$parameter_tbl %>%
dplyr::select(dist_type, size, prob) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(bn, "try-error")){
tb <- tidy_binomial(.n = n, .size = round(bn[[2]], 2), .prob = round(bn[[3]], 2))
}
ge <- try(util_geometric_param_estimate(trunc(x_term))$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, shape) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(ge, "try-error")){
tg <- tidy_geometric(.n = n, .prob = round(ge[[2]], 2))
}
h <- try(util_hypergeometric_param_estimate(.x = trunc(x_term), .total = n, .k = n)$parameter_tbl %>%
tidyr::drop_na() %>%
dplyr::slice(1) %>%
dplyr::select(dist_type, m, total) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(h, "try-error")){
th <- tidy_hypergeometric(
.n = n,
.m = trunc(h[[2]]),
.nn = n - trunc(h[[2]]),
.k = trunc(h[[2]])
)
}
po <- try(util_poisson_param_estimate(trunc(x_term))$parameter_tbl %>%
dplyr::select(dist_type, lambda) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(po, "try-error")){
tp <- tidy_poisson(.n = n, .lambda = round(po[[2]], 2))
}
comp_tbl <- tidy_combine_distributions(
tidy_empirical(.x = x_term, .distribution_type = dist_type),
if (exists("tb") && nrow(tb) > 0){tb},
if (exists("tg") && nrow(tb) > 0){tg},
if (exists("th") && nrow(tb) > 0){th},
if (exists("tp") && nrow(tb) > 0){tp}
)
# comp_tbl <- tidy_combine_distributions(
# tidy_empirical(.x = x_term, .distribution_type = dist_type),
# tidy_binomial(.n = n, .size = round(bn[[2]], 2), .prob = round(bn[[3]], 2)),
# tidy_geometric(.n = n, .prob = round(ge[[2]], 2)),
# tidy_hypergeometric(
# .n = n,
# .m = trunc(h[[2]]),
# .nn = n - trunc(h[[2]]),
# .k = trunc(h[[2]])
# ),
# tidy_poisson(.n = n, .lambda = round(po[[2]], 2))
# )
}
# Deviance calculations ----
deviance_tbl <- comp_tbl %>%
dplyr::select(dist_type, x, y) %>%
dplyr::group_by(dist_type, x) %>%
dplyr::mutate(y = stats::median(y)) %>%
dplyr::ungroup() %>%
dplyr::group_by(dist_type) %>%
dplyr::mutate(y = tidy_scale_zero_one_vec(y)) %>%
dplyr::ungroup() %>%
tidyr::pivot_wider(
id_cols = x,
names_from = dist_type,
values_from = y
) %>%
dplyr::select(x, Empirical, dplyr::everything()) %>%
dplyr::mutate(
dplyr::across(
.cols = -c(x, Empirical),
.fns = ~ Empirical - .
)
) %>%
tidyr::drop_na() %>%
tidyr::pivot_longer(
cols = -x
)
total_deviance_tbl <- deviance_tbl %>%
dplyr::filter(!name == "Empirical") %>%
dplyr::group_by(name) %>%
dplyr::summarise(total_deviance = sum(value)) %>%
dplyr::ungroup() %>%
dplyr::mutate(total_deviance = abs(total_deviance)) %>%
dplyr::arrange(abs(total_deviance)) %>%
dplyr::rename(dist_with_params = name,
abs_tot_deviance = total_deviance)
# Return ----
attr(deviance_tbl, ".tibble_type") <- "deviance_comparison_tbl"
attr(total_deviance_tbl, ".tibble_type") <- "deviance_results_tbl"
output <- list(
comparison_tbl = comp_tbl,
deviance_tbl = deviance_tbl,
total_deviance_tbl = total_deviance_tbl
)
print(total_deviance_tbl)
return(invisible(output))
} Example: > tidy_distribution_comparison(trunc(mtcars$mpg), "discrete")
# A tibble: 4 × 2
dist_with_params abs_tot_deviance
<chr> <dbl>
1 Poisson c(19.69) 1.52
2 Hypergeometric c(21, 11, 21) 3.19
3 Geometric c(0.05) 4.37
4 Binomial c(32, 0.03) 6.48
> tidy_distribution_comparison(mtcars$mpg, "continuous")
For the beta distribution, its mean 'mu' should be 0 < mu < 1. The data will
therefore be scaled to enforce this.
# A tibble: 9 × 2
dist_with_params abs_tot_deviance
<chr> <dbl>
1 Weibull c(3.58, 22.29) 0.678
2 Exponential c(0.05) 1.06
3 Uniform c(31.84, 8.34) 1.60
4 Beta c(1.11, 1.58, 0) 2.18
5 Lognormal c(2.96, 0.29) 2.66
6 Gamma c(11.47, 1.75) 3.23
7 Logistic c(20.09, 3.27) 3.54
8 Pareto c(10.4, 1.62) 6.06
9 Cauchy c(19.2, 7.38) 10.4
> tidy_distribution_comparison(hai_scale_zscore_vec(mtcars$mpg), "continuous")
For the beta distribution, its mean 'mu' should be 0 < mu < 1. The data will
therefore be scaled to enforce this.
Error in util_gamma_param_estimate(x_term) :
The numeric vector '.x' must contain at least two unique values greater
than 0
Error in util_lognormal_param_estimate(x_term) :
'.x' must contain at least two non-missing distict values. All non-missing
values must be positive.
Error in util_pareto_param_estimate(x_term) :
'.x' must contain at least two non-missing distinct values. All values of
'.x' must be positive.
Error in survival::survreg(x_surv ~ 1, dist = "weibull") :
Invalid survival times for this distribution
In addition: Warning messages:
1: In log(dlist$dtrans(Y[exactsurv, 1])) : NaNs produced
2: In log(y) : NaNs produced
# A tibble: 5 × 2
dist_with_params abs_tot_deviance
<chr> <dbl>
1 Beta c(1.11, 1.58, 0) 0.752
2 Cauchy c(-0.15, 1.22) 1.53
3 Uniform c(1.95, -1.95) 3.67
4 Exponential c(14060018348863988) 3.86
5 Logistic c(0, 0.54) 4.22 |
Repository owner
moved this from In Progress
to Done
in @spsanderson's Repository Issue Overview
Jun 13, 2022
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Make sure the function does not fail out if data provided does not fit the parameters of what is needed to estimate a particular distributions parameters.
The text was updated successfully, but these errors were encountered: