Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

T, also need param_estimate #480

Closed
Tracked by #467
spsanderson opened this issue May 3, 2024 · 0 comments
Closed
Tracked by #467

T, also need param_estimate #480

spsanderson opened this issue May 3, 2024 · 0 comments
Assignees
Labels
enhancement New feature or request

Comments

@spsanderson
Copy link
Owner

spsanderson commented May 3, 2024

Param Estimate

Function:

#' Estimate t Distribution Parameters
#'
#' @family Parameter Estimation
#' @family t Distribution
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will attempt to estimate the t distribution parameters
#' given some vector of values produced by `rt()`. The estimation method
#' uses both method of moments and maximum likelihood estimation.
#'
#' @param .x The vector of data to be passed to the function, where the data
#' comes from the `rt()` function.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function output.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#'
#' set.seed(123)
#' x <- rt(100, df = 10, ncp = 0.5)
#' output <- util_t_param_estimate(x)
#'
#' output$parameter_tbl
#'
#' output$combined_data_tbl |>
#'   tidy_combined_autoplot()
#'
#' @return
#' A tibble/list
#'
#' @export
#'

util_t_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {
  
  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  
  # Checks ----
  if (!inherits(x_term, "numeric")) {
    rlang::abort(
      message = "The '.x' parameter must be numeric.",
      use_cli_format = TRUE
    )
  }
  
  # Method of Moments Estimation ----
  m <- mean(x_term)
  s <- sd(x_term)
  df_mme <- n * s^2 / (n-1)
  
  # Initial parameter guesses for MLE
  start_params <- c(df = df_mme, ncp = m)
  
  # Negative Log Likelihood Function for the t-distribution
  nll <- function(params) {
    df <- params[1]
    ncp <- params[2]
    if (df <= 0) return(Inf) # Ensure df is positive
    -sum(stats::dt(x_term, df, ncp, log = TRUE))
  }
  
  # Minimize Negative Log Likelihood
  optim_res <- stats::optim(start_params, nll, method = "CG")
  
  # Estimated Parameters
  optim_df <- round(optim_res$par[1], 3) |> unname()
  optim_ncp <- round(optim_res$par[2], 3) |> unname()
  
  # Return Tibble ----
  if (.auto_gen_empirical) {
    te <- tidy_empirical(.x = x_term)
    td_mme <- tidy_t(.n = n, .df = round(df_mme, 3), .ncp = round(m, 3))
    td_optim <- tidy_t(.n = n, .df = round(optim_df, 3), .ncp = round(optim_ncp, 3))
    combined_tbl <- tidy_combine_distributions(te, td_mme, td_optim)
  }
  
  ret <- dplyr::tibble(
    dist_type = rep("T Distribution", 2),
    samp_size = rep(n, 2),
    mean      = m,
    variance  = s^2,
    method    = c("MME", "MLE"),
    df_est    = c(df_mme, optim_df),
    ncp_est   = c(m, optim_ncp)
  )
  
  # Return ----
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "t_distribution"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- length(x_term)
  
  if (.auto_gen_empirical) {
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl     = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }
  
  return(output)
}

Example:

> set.seed(123)
> x <- rt(100, df = 10, ncp = 0.5)
> output <- util_t_param_estimate(x)
There were 50 or more warnings (use warnings() to see the first 50)
> 
> output$parameter_tbl
# A tibble: 2 × 7
  dist_type      samp_size  mean variance method df_est ncp_est
  <chr>              <int> <dbl>    <dbl> <chr>   <dbl>   <dbl>
1 t Distribution       100 0.612    0.949 MME     0.959   0.612
2 t Distribution       100 0.612    0.949 MLE     8.32    0.571
> 
> output$combined_data_tbl |>
+   tidy_combined_autoplot()

image

AIC

Function:

#' Calculate Akaike Information Criterion (AIC) for t Distribution
#'
#' This function calculates the Akaike Information Criterion (AIC) for a t
#' distribution fitted to the provided data.
#'
#' @family Utility
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' This function estimates the parameters of a t distribution from the provided
#' data using maximum likelihood estimation, and then calculates the AIC value
#' based on the fitted distribution.
#'
#' @param .x A numeric vector containing the data to be fitted to a t
#' distribution.
#'
#' @return The AIC value calculated based on the fitted t distribution to the
#' provided data.
#'
#' @details
#' This function fits a t distribution to the input data using maximum
#' likelihood estimation and then computes the Akaike Information Criterion (AIC)
#' based on the fitted distribution.
#'
#' @examples
#' # Generate t-distributed data
#' set.seed(123)
#' x <- rt(100, df = 5, ncp = 0.5)
#'
#' # Calculate AIC for the generated data
#' util_t_aic(x)
#'
#' @seealso
#' \code{\link[stats]{rt}} for generating t-distributed data,
#' \code{\link[stats]{optim}} for optimization.
#'
#' @name util_t_aic
NULL

#' @export
#' @rdname util_t_aic

util_t_aic <- function(.x) {
  # Tidyeval
  x_term <- as.numeric(.x)
  
  # Assume we have a utility function that estimates t-distribution parameters
  # util_t_param_estimate needs to be defined as in previous example
  pe <- util_t_param_estimate(x_term)$parameter_tbl |>
    dplyr::filter(method == "MLE")
  
  # Negative log-likelihood function for the t-distribution
  nll <- function(params) {
    df <- params[1]
    ncp <- params[2]
    if (df <= 0) return(Inf) # return Inf if params are not valid
    -sum(stats::dt(x_term, df, ncp, log = TRUE))
  }
  
  # Use optim to minimize the negative log-likelihood
  # Starting parameters from the parameter estimation function's MLE output
  start_params <- c(df = pe$df_est, ncp = pe$ncp_est) # Using MLE results for initial values
  
  fit_t <- stats::optim(start_params, nll, method = "CG") # Degrees of freedom must be positive
  
  # Extract log-likelihood and number of parameters
  logLik_t <- -fit_t$value
  k_t <- 2 # Number of parameters for t distribution (df, ncp)
  
  # Calculate AIC
  AIC_t <- 2 * k_t - 2 * logLik_t
  
  # Return AIC
  return(AIC_t)
}

Example:

> set.seed(123)
> x <- rt(100, df = 5, ncp = 0.5)
> 
> # Calculate AIC for the generated data
> util_t_aic(x)
[1] 305.77

# Check against fitdistrplust
> fitdistrplus::fitdist(x, "t", start = list(df = pe$df_est, ncp = pe$ncp_est))$aic
[1] 305.763
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Development

No branches or pull requests

1 participant