Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distribution Comparison Function #202

Closed
spsanderson opened this issue Jun 7, 2022 · 0 comments
Closed

Distribution Comparison Function #202

spsanderson opened this issue Jun 7, 2022 · 0 comments
Assignees
Labels
enhancement New feature or request

Comments

@spsanderson
Copy link
Owner

Compare a set of data over different distributions to find the best possible fit based on absolute sum total difference in generated data after finding best possible parameter estimates.

Function:

tidy_distribution_comparison <- function(.x, .distribution_type = "continuous"){
  
  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  dist_type <- as.character(.distribution_type)
  
  if (!dist_type %in% c("continuous","discrete")){
    rlang::abort(
      message = "The '.distribution_type' parameter must be either 'continuous' 
      or 'discrete'.",
      use_cli_format = TRUE
    )
  }
  
  # Get parameter estimates for distributions
  if (dist_type == "continuous"){
    b <- util_beta_param_estimate(x_term)$parameter_tbl %>%
      dplyr::filter(method == "NIST_MME") %>%
      dplyr::select(dist_type, shape1, shape2) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    c <- util_cauchy_param_estimate(x_term)$parameter_tbl %>%
      dplyr::select(dist_type, location, scale) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    e <- util_exponential_param_estimate(x_term)$parameter_tbl %>%
      dplyr::select(dist_type, rate) %>%
      dplyr::mutate(param_2 = NA) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    g <- util_gamma_param_estimate(x_term)$parameter_tbl %>%
      dplyr::filter(method == "NIST_MME") %>%
      dplyr::select(dist_type, shape, scale) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    l <- util_logistic_param_estimate(x_term)$parameter_tbl %>%
      dplyr::filter(method == "EnvStats_MME") %>%
      dplyr::select(dist_type, location, scale) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    ln <- util_lognormal_param_estimate(x_term)$parameter_tbl %>%
      dplyr::filter(method == "EnvStats_MME") %>%
      dplyr::select(dist_type, mean_log, sd_log) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    p <- util_pareto_param_estimate(x_term)$parameter_tbl %>%
      dplyr::filter(method == "MLE") %>%
      dplyr::select(dist_type, shape, scale) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    u <- util_uniform_param_estimate(x_term)$parameter_tbl %>%
      dplyr::filter(method == "NIST_MME") %>%
      dplyr::select(dist_type, min_est, max_est) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    w <- util_weibull_param_estimate(x_term)$parameter_tbl %>%
      dplyr::select(dist_type, shape, scale) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    comp_tbl <- tidy_combine_distributions(
      tidy_empirical(x_term, .distribution_type = dist_type),
      tidy_beta(.n = n, .shape1 = round(b[[2]], 2), .shape2 = round(b[[3]], 2)),
      tidy_cauchy(.n = n, .location = round(c[[2]], 2), .scale = round(c[[3]], 2)),
      tidy_exponential(.n = n, .rate = round(e[[2]], 2)),
      tidy_gamma(.n = n, .shape = round(g[[2]], 2),  .scale = round(g[[3]], 2)),
      tidy_logistic(.n = n, .location = round(l[[2]], 2), .scale = round(l[[3]], 2)),
      tidy_lognormal(.n = n, .meanlog = round(ln[[2]], 2), .sdlog = round(ln[[3]], 2)),
      tidy_pareto(.n = n, .shape = round(p[[2]], 2), .scale = round(p[[3]], 2)),
      tidy_uniform(.n = n, .min = round(u[[2]], 2), .max = round(u[[3]], 2)),
      tidy_weibull(.n = n, .shape = round(w[[2]], 2), .scale = round(w[[3]], 2))
    )
    
  } else {
    bn <- util_binomial_param_estimate(trunc(tidy_scale_zero_one_vec(x_term)))$parameter_tbl %>%
      dplyr::select(dist_type, size, prob) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    ge <- util_geometric_param_estimate(x_term)$parameter_tbl %>%
      dplyr::filter(method == "EnvStats_MME") %>%
      dplyr::select(dist_type, shape) %>%
      dplyr::mutate(param_2 = NA) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    h <- util_hypergeometric_param_estimate(.x = x_term, .total = n, .k = n)$parameter_tbl %>%
      tidyr::drop_na() %>%
      dplyr::slice(1) %>%
      dplyr::select(dist_type, m, total) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    po <- util_poisson_param_estimate(x_term)$parameter_tbl %>%
      dplyr::select(dist_type, lambda) %>%
      dplyr::mutate(param_2 = NA) %>%
      purrr::set_names("dist_type", "param_1", "param_2")
    
    comp_tbl <- tidy_combine_distributions(
      tidy_empirical(.x = x_term, .distribution_type = dist_type),
      tidy_binomial(.n = n, .size = round(bn[[2]], 2), .prob = round(bn[[3]], 2)),
      tidy_geometric(.n = n, .prob = round(ge[[2]], 2)),
      tidy_hypergeometric(
        .n = n, 
        .m = trunc(h[[2]]), 
        .nn = n - trunc(h[[2]]), 
        .k = trunc(h[[2]])
      ),
      tidy_poisson(.n = n, .lambda = round(po[[2]], 2))
    )
      
  }
  
  # Deviance calculations ----
  deviance_tbl <- comp_tbl %>%
    dplyr::select(dist_type, x, y) %>% 
    dplyr::group_by(dist_type, x) %>%
    dplyr::mutate(y = stats::median(y)) %>%
    dplyr::ungroup() %>%
    dplyr::group_by(dist_type) %>%
    dplyr::mutate(y = tidy_scale_zero_one_vec(y)) %>%
    dplyr::ungroup() %>%
    tidyr::pivot_wider(
      id_cols = x,
      names_from = dist_type,
      values_from = y
    ) %>%
    dplyr::select(x, Empirical, dplyr::everything()) %>%
    dplyr::mutate(
      dplyr::across(
        .cols = -c(x, Empirical),
        .fns = ~ Empirical - .
      )
    ) %>%
    tidyr::drop_na() %>%
    tidyr::pivot_longer(
      cols = -x
    )
  
  total_deviance_tbl <- deviance_tbl %>%
    dplyr::filter(!name == "Empirical") %>%
    dplyr::group_by(name) %>%
    dplyr::summarise(total_deviance = sum(value)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(total_deviance = abs(total_deviance)) %>%
    dplyr::arrange(abs(total_deviance)) %>%
    dplyr::rename(dist_with_params = name, 
                  abs_tot_deviance = total_deviance)
  
  # Return ----
  attr(deviance_tbl, ".tibble_type") <- "deviance_comparison_tbl"
  attr(total_deviance_tbl, ".tibble_type") <- "deviance_results_tbl"
  
  output <- list(
    comp_tbl, 
    deviance_tbl,
    total_deviance_tbl
  )
  
  print(total_deviance_tbl)
  
}

Example:

x_term <- mtcars$mpg

> tidy_distribution_comparison(x_term)
For the beta distribution, its mean 'mu' should be 0 < mu < 1. The data will therefore be
scaled to enforce this.
# A tibble: 9 x 2
  dist_with_params        abs_tot_deviance
  <chr>                              <dbl>
1 Gamma c(11.13, 1.77)               0.414
2 Weibull c(3.54, 21.86)             0.631
3 Beta c(1.09, 1.5, 0)               1.31 
4 Uniform c(31.19, 8.19)             2.42 
5 Lognormal c(2.94, 0.3)             4.04 
6 Exponential c(0.05)                4.41 
7 Pareto c(10, 1.58)                 6.43 
8 Logistic c(19.69, 3.25)            6.96 
9 Cauchy c(19, 7)                   12.5 

x_term <- trunc(mtcars$mpg)

> tidy_distribution_comparison(x_term, "discrete")
# A tibble: 4 x 2
  dist_with_params             abs_tot_deviance
  <chr>                                   <dbl>
1 Poisson c(19.69)                        0.188
2 Hypergeometric c(21, 11, 21)            2.52 
3 Geometric c(0.05)                       2.60 
4 Binomial c(32, 0.03)                    3.14 
@spsanderson spsanderson added the enhancement New feature or request label Jun 7, 2022
@spsanderson spsanderson added this to the TidyDensity v1.2.0 milestone Jun 7, 2022
@spsanderson spsanderson self-assigned this Jun 7, 2022
@spsanderson spsanderson moved this from Todo to In Progress in @spsanderson's Repository Issue Overview Jun 7, 2022
Repository owner moved this from In Progress to Done in @spsanderson's Repository Issue Overview Jun 7, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Development

No branches or pull requests

1 participant