Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zero Truncated Binomial. param_estimate and stats_tbl #469

Closed
Tracked by #467
spsanderson opened this issue May 3, 2024 · 0 comments
Closed
Tracked by #467

Zero Truncated Binomial. param_estimate and stats_tbl #469

spsanderson opened this issue May 3, 2024 · 0 comments
Assignees
Labels
enhancement New feature or request question Further information is requested

Comments

@spsanderson
Copy link
Owner

spsanderson commented May 3, 2024

Parameter Estimate Function

Function:

#' Estimate Zero Truncated Binomial Parameters
#'
#' @family Parameter Estimation
#' @family Binomial
#' @family Zero Truncated Distribution
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will attempt to estimate the zero truncated  
#' binomial size and prob parameters given some vector of values.
#'
#' @description The function will return a list output by default, and  if the parameter
#' `.auto_gen_empirical` is set to `TRUE` then the empirical data given to the
#' parameter `.x` will be run through the `tidy_empirical()` function and combined
#' with the estimated binomial data.
#'
#' One method of estimating the parameters is done via:
#' -  MLE via \code{\link[stats]{optim}} function.
#'
#' @param .x The vector of data to be passed to the function.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function output.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#' library(actuar)
#' 
#' x <- as.integer(mtcars$mpg)
#' output <- util_zero_truncated_binomial_param_estimate(x)
#'
#' output$parameter_tbl
#'
#' output$combined_data_tbl |>
#'   tidy_combined_autoplot()
#'
#' set.seed(123)
#' t <- rztbinom(100, 10, .1)
#' util_zero_truncated_binomial_param_estimate(t)$parameter_tbl
#'
#' @return
#' A tibble/list
#' 
#' @name util_zero_truncated_binomial_param_estimate
NULL

#' @export
#' @rdname util_zero_truncated_binomial_param_estimate

util_zero_truncated_binomial_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {
  
  # Check if actuar library is installed
  if (!requireNamespace("actuar", quietly = TRUE)) {
    stop("The 'actuar' package is needed for this function. Please install it with: install.packages('actuar')")
  }
  
  # Tidyeval ----
  x_term <- as.numeric(.x)
  sum_x <- sum(x_term, na.rm = TRUE)
  minx <- min(x_term)
  maxx <- max(x_term)
  m <- mean(x_term, na.rm = TRUE)
  n <- length(x_term)
  
  # Negative log-likelihood function for zero-truncated binomial distribution
  nll_func <- function(par) {
    n <- par[1]
    p <- par[2]
    if (n <= 0 || p <= 0 || p >= 1) {
      return(-Inf)
    }
    -sum(actuar::dztbinom(x_term, size = n, prob = p, log = TRUE))
  }
  
  # Initial parameter guesses 
  initial_params <- c(size = max(x_term), prob = 0.5)  # Adjust based on your data
  
  # Optimization using optim()
  optim_result <- optim(
    par = initial_params, 
    fn = nll_func
  ) |>
    suppressWarnings()
  
  # Extract estimated parameters
  mle_size <- optim_result$par[1]
  mle_prob <- optim_result$par[2]
  mle_msg  <- optim_result$message
  
  # Create output tibble
  ret <- tibble::tibble(
    dist_type = "Zero-Truncated Binomial",
    samp_size = n,
    min = minx,
    max = maxx,
    mean = m,
    method = "MLE_Optim",
    size = mle_size,
    prob = mle_prob,
    message = mle_msg
  )
  
  # Attach attributes
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "zero_truncated_binomial"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n
  
  if (.auto_gen_empirical) {
    # Generate empirical data
    # Assuming tidy_empirical and tidy_combine_distributions functions exist
    te <- tidy_empirical(.x = .x)
    td <- tidy_zero_truncated_binomial(
      .n = n,
      .size = round(mle_size, 3),
      .prob = round(mle_prob, 3)
    )
    combined_tbl <- tidy_combine_distributions(te, td)
    
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }
  
  return(output)
}

Example:

> library(dplyr)
> library(ggplot2)
> library(actuar)
> 
> x <- as.integer(mtcars$mpg)
> output <- util_zero_truncated_binomial_param_estimate(x)
> 
> output$parameter_tbl
# A tibble: 1 × 8
  dist_type               samp_size   min   max  mean method     size  prob
  <chr>                       <int> <dbl> <dbl> <dbl> <chr>     <dbl> <dbl>
1 Zero-Truncated Binomial        32    10    33  19.7 MLE_Optim    33 0.597
> 
> output$combined_data_tbl |>
+   tidy_combined_autoplot()
> 
> set.seed(123)
> t <- rztbinom(100, 10, .1)
> util_zero_truncated_binomial_param_estimate(t)$parameter_tbl
# A tibble: 1 × 8
  dist_type               samp_size   min   max  mean method     size  prob
  <chr>                       <int> <dbl> <dbl> <dbl> <chr>     <dbl> <dbl>
1 Zero-Truncated Binomial       100     1     4  1.53 MLE_Optim  4.00 0.500

AIC

Function:

#' Calculate Akaike Information Criterion (AIC) for Zero-Truncated Binomial Distribution
#'
#' This function calculates the Akaike Information Criterion (AIC) for a 
#' zero-truncated binomial (ZTB) distribution fitted to the provided data.
#'
#' @family Utility
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' This function estimates the parameters (`size` and `prob`) of a ZTB
#' distribution from the provided data using maximum likelihood estimation 
#' (via the `optim()` function), and then calculates the AIC value based on the 
#' fitted distribution. 
#'
#' @param .x A numeric vector containing the data (non-zero counts) to be 
#'   fitted to a ZTB distribution.
#'
#' @details
#' **Initial parameter estimates:** The choice of initial values for `size` 
#' and `prob` can impact the convergence of the optimization. Consider using 
#' prior knowledge or method of moments estimates to obtain reasonable starting 
#' values. 
#'
#' **Optimization method:** The default optimization method used is 
#' "L-BFGS-B," which allows for box constraints to keep the parameters within 
#' valid bounds. You might explore other optimization methods available in 
#' `optim()` for potentially better performance or different constraint 
#' requirements.
#'
#' **Data requirements:** The input data `.x` should consist of non-zero counts, 
#' as the ZTB distribution does not include zero values. Additionally, the 
#' values in `.x` should be less than or equal to the estimated `size` parameter.
#'
#' **Goodness-of-fit:** While AIC is a useful metric for model comparison, it's 
#' recommended to also assess the goodness-of-fit of the chosen ZTB model using
#' visualization (e.g., probability plots, histograms) and other statistical 
#' tests (e.g., chi-square goodness-of-fit test) to ensure it adequately 
#' describes the data.
#'
#' @examples
#' library(actuar)
#' 
#' # Example data
#' set.seed(123)
#' x <- rztbinom(30, size = 10, prob = 0.4)
#' 
#' # Calculate AIC
#' util_zero_truncated_binomial_aic(x)
#'
#' @return The AIC value calculated based on the fitted ZTB distribution to 
#'   the provided data.
#'
#' @name util_zero_truncated_binomial_aic
NULL

#' @export
#' @rdname util_zero_truncated_binomial_aic

util_zero_truncated_binomial_aic <- function(.x) {
  # Check if actuar library is installed
  if (!requireNamespace("actuar", quietly = TRUE)) {
    stop("The 'actuar' package is needed for this function. Please install it with: install.packages('actuar')")
  }
  
  # Tidyeval
  x <- as.numeric(.x)
  
  # Negative log-likelihood function for zero-truncated binomial distribution
  neg_log_lik_rztbinom <- function(params, data) {
    size <- params[1]
    prob <- params[2]
    n <- length(data)
    -sum(actuar::dztbinom(data, size = size, prob = prob, log = TRUE))
  }
  
  # Initial parameter guesses (adjust if needed)
  pe <- util_zero_truncated_binomial_param_estimate(x)$parameter_tbl
  
  # Fit zero-truncated binomial distribution to data
  fit_rztbinom <- optim(
    par = c(size = round(pe$size, 3), prob = round(pe$prob, 3)), 
    fn = neg_log_lik_rztbinom,
    data = x
  ) |>
    suppressWarnings()
  
  # Extract log-likelihood and number of parameters
  logLik_rztbinom <- round(-fit_rztbinom$value, 4)
  k_rztnbinom <- 2  # Number of parameters (size and prob)
  
  # Calculate AIC
  AIC_rztbinom <- 2 * k_rztnbinom - 2 * logLik_rztbinom
  
  # Return AIC value
  return(AIC_rztbinom)
}

Example:

> library(actuar)
> 
> # Example data
> set.seed(123)
> x <- rztbinom(30, size = 10, prob = 0.4)
> 
> # Calculate AIC
> util_zero_truncated_binomial_aic(x)
[1] 116.5688

Stats Tibble

Function:

Example:

@spsanderson spsanderson changed the title Zero Truncated Binomial Zero Truncated Binomial. param_estimate and stats_tbl May 3, 2024
@spsanderson spsanderson self-assigned this May 3, 2024
@spsanderson spsanderson added the enhancement New feature or request label May 3, 2024
@spsanderson spsanderson added this to the TidyDensity 1.4.1 milestone May 3, 2024
@spsanderson spsanderson moved this from Todo to In Progress in @spsanderson's Repository Issue Overview May 3, 2024
@spsanderson spsanderson added the question Further information is requested label May 6, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request question Further information is requested
Development

No branches or pull requests

1 participant