Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paralogistic, also need param_estimate and stats_tbl #478

Closed
Tracked by #467
spsanderson opened this issue May 3, 2024 · 0 comments
Closed
Tracked by #467

Paralogistic, also need param_estimate and stats_tbl #478

spsanderson opened this issue May 3, 2024 · 0 comments
Assignees
Labels
enhancement New feature or request

Comments

@spsanderson
Copy link
Owner

spsanderson commented May 3, 2024

Param Estimate

Function:

#' Estimate Paralogistic Parameters
#'
#' @family Parameter Estimation
#' @family Paralogistic
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will attempt to estimate the paralogistic shape and
#' rate parameters given some vector of values.
#'
#' The estimates are obtained by minimising the negative log-likelihood of
#' `actuar::dparalogis()` with `stats::optim()` (method `"L-BFGS-B"`, both
#' parameters bounded below by `1e-10`). The optimiser is started from
#' moment-based guesses; guesses that fall below the lower bound are clamped
#' to it, and non-finite guesses are replaced by `c(1, 1 / mean(.x))`.
#'
#' The paralogistic distribution is only supported on strictly positive values,
#' so `.x` must contain finite, non-missing values greater than zero.
#'
#' @description The function will return a list output by default, and if the
#' parameter `.auto_gen_empirical` is set to `TRUE` then the empirical data
#' given to the parameter `.x` will be run through the `tidy_empirical()`
#' function and combined with the estimated paralogistic data.
#'
#' The method of parameter estimation is:
#' -  MLE
#'
#' @param .x The vector of data to be passed to the function. It is coerced
#' with `as.numeric()` and must hold at least two unique, finite, strictly
#' positive values.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function
#' output.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#'
#' x <- mtcars$mpg
#' output <- util_paralogistic_param_estimate(x)
#'
#' output$parameter_tbl
#'
#' output$combined_data_tbl |>
#'   tidy_combined_autoplot()
#'
#' t <- tidy_paralogistic(50, 2.5, 1.4)[["y"]]
#' util_paralogistic_param_estimate(t)$parameter_tbl
#'
#' @return
#' A list. It always contains `parameter_tbl`, a one-row tibble with the sample
#' summary and the MLE `shape` and `rate`. When `.auto_gen_empirical = TRUE` it
#' also contains `combined_data_tbl`.
#'
#' @export
#'

util_paralogistic_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {

  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  unique_terms <- length(unique(x_term))

  # Checks ----
  # Size check comes first so that empty input aborts cleanly instead of
  # triggering min()/max() warnings.
  if (n < 2 || unique_terms < 2) {
    rlang::abort(
      message = "The data must have at least two (2) unique data points.",
      use_cli_format = TRUE
    )
  }

  # The paralogistic log-density is not finite outside (0, Inf), which would
  # otherwise only surface as an opaque failure inside optim().
  if (!all(is.finite(x_term)) || any(x_term <= 0)) {
    rlang::abort(
      message = paste0(
        "The data must contain only finite, non-missing values ",
        "greater than zero (0)."
      ),
      use_cli_format = TRUE
    )
  }

  # Sample summaries ----
  minx <- min(x_term)
  maxx <- max(x_term)
  mean_x <- mean(x_term)
  var_x <- stats::var(x_term)

  # Starting values ----
  lower_bounds <- c(1e-10, 1e-10)
  moment_gap <- var_x - mean_x^2
  start_par <- c(2 * mean_x^2 / moment_gap, 2 * mean_x / moment_gap)

  # moment_gap == 0 yields non-finite guesses; fall back to a neutral start.
  if (!all(is.finite(start_par))) {
    start_par <- c(1, 1 / mean_x)
  }

  # The guesses are negative whenever var < mean^2. L-BFGS-B would clamp an
  # out-of-bounds start to the box itself; doing it here makes the actual
  # starting point explicit.
  start_par <- pmax(start_par, lower_bounds)

  # MLE ----
  neg_log_lik_paralogis <- function(par, data) {
    -sum(
      actuar::dparalogis(data, shape = par[1], rate = par[2], log = TRUE)
    )
  }

  mle_params <- stats::optim(
    par = start_par,
    fn = neg_log_lik_paralogis,
    data = x_term,
    method = "L-BFGS-B",
    lower = lower_bounds
  )$par

  shape_mle <- mle_params[[1]]
  rate_mle <- mle_params[[2]]

  # Return Tibble ----
  ret <- dplyr::tibble(
    dist_type = "Paralogistic",
    samp_size = n,
    min = minx,
    max = maxx,
    mean = mean_x,
    var = var_x,
    method = "MLE",
    shape = shape_mle,
    rate = rate_mle,
    shape_rate_ratio = shape_mle / rate_mle
  )

  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "paralogistic"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n

  # Return ----
  if (!.auto_gen_empirical) {
    return(list(parameter_tbl = ret))
  }

  # Empirical data next to a simulated sample drawn at the fitted parameters.
  te <- tidy_empirical(.x = x_term)
  td_mle <- tidy_paralogistic(
    .n = n,
    .shape = round(shape_mle, 3),
    .rate = round(rate_mle, 3)
  )

  list(
    combined_data_tbl = tidy_combine_distributions(te, td_mle),
    parameter_tbl     = ret
  )
}

Example:

> x <- mtcars$mpg
> output <- util_paralogistic_param_estimate(x)
> 
> output$parameter_tbl
# A tibble: 1 × 10
  dist_type    samp_size   min   max  mean   var method shape   rate shape_rate_ratio
  <chr>            <int> <dbl> <dbl> <dbl> <dbl> <chr>  <dbl>  <dbl>            <dbl>
1 Paralogistic        32  10.4  33.9  20.1  36.3 MLE     4.14 0.0336             123.
> 
> output$combined_data_tbl |>
+   tidy_combined_autoplot()
> 
> t <- tidy_paralogistic(50, 2.5, 1.4)[["y"]]
> util_paralogistic_param_estimate(t)$parameter_tbl
# A tibble: 1 × 10
  dist_type    samp_size    min   max  mean    var method shape  rate shape_rate_ratio
  <chr>            <int>  <dbl> <dbl> <dbl>  <dbl> <chr>  <dbl> <dbl>            <dbl>
1 Paralogistic        50 0.0358  1.29 0.504 0.0637 MLE     2.63  1.36             1.93
> x <- mtcars$mpg
> output <- util_paralogistic_param_estimate(x)
> 
> output$parameter_tbl
# A tibble: 1 × 10
  dist_type    samp_size   min   max  mean   var method shape   rate shape_rate_ratio
  <chr>            <int> <dbl> <dbl> <dbl> <dbl> <chr>  <dbl>  <dbl>            <dbl>
1 Paralogistic        32  10.4  33.9  20.1  36.3 MLE     4.14 0.0336             123.
> 
> output$combined_data_tbl |>
+   tidy_combined_autoplot()
> 
> t <- tidy_paralogistic(50, 2.5, 1.4)[["y"]]
> util_paralogistic_param_estimate(t)$parameter_tbl
# A tibble: 1 × 10
  dist_type    samp_size    min   max  mean    var method shape  rate shape_rate_ratio
  <chr>            <int>  <dbl> <dbl> <dbl>  <dbl> <chr>  <dbl> <dbl>            <dbl>
1 Paralogistic        50 0.0358  1.29 0.504 0.0637 MLE     2.63  1.36             1.93

image

AIC

Function:

#' Calculate Akaike Information Criterion (AIC) for Paralogistic Distribution
#'
#' This function calculates the Akaike Information Criterion (AIC) for a
#' paralogistic distribution fitted to the provided data.
#'
#' @family Utility
#' @family Paralogistic
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' This function estimates the shape and rate parameters of a paralogistic
#' distribution from the provided data using maximum likelihood estimation,
#' and then calculates the AIC value based on the fitted distribution.
#'
#' @param .x A numeric vector containing the data to be fitted to a
#' paralogistic distribution.
#'
#' @details
#' This function fits a paralogistic distribution to the provided data using
#' maximum likelihood estimation and calculates the AIC as
#' `2 * k - 2 * logLik` with `k = 2` (shape and rate).
#'
#' Initial parameter estimates: The function uses the MLE row of
#' `util_paralogistic_param_estimate()` as the starting point for the shape
#' and rate parameters. The estimator is called with
#' `.auto_gen_empirical = FALSE`, so no comparison data is built and no random
#' numbers are drawn while computing the AIC.
#'
#' Optimization method: The function uses `stats::optim()` with the
#' `"L-BFGS-B"` method and both parameters bounded below by `1e-10`.
#'
#' Goodness-of-fit: While AIC is a useful metric for model comparison, it's
#' recommended to also assess the goodness-of-fit of the chosen model using
#' visualization and other statistical tests.
#'
#' @examples
#' # Example 1: Calculate AIC for a sample dataset
#' set.seed(123)
#' x <- tidy_paralogistic(30, .shape = 2, .rate = 1)[["y"]]
#' util_paralogistic_aic(x)
#'
#' @return
#' The AIC value calculated based on the fitted paralogistic distribution to
#' the provided data.
#'
#' @name util_paralogistic_aic
NULL

#' @export
#' @rdname util_paralogistic_aic
util_paralogistic_aic <- function(.x) {
  # Tidyeval
  x <- as.numeric(.x)

  # Negative log-likelihood function for paralogistic distribution
  neg_log_lik_paralogis <- function(par, data) {
    -sum(
      actuar::dparalogis(data, shape = par[1], rate = par[2], log = TRUE)
    )
  }

  # Starting values: the MLE row of the parameter-estimate table. Only that
  # table is needed, so skip the empirical/simulated comparison data.
  pe <- TidyDensity::util_paralogistic_param_estimate(
    x,
    .auto_gen_empirical = FALSE
  )$parameter_tbl
  pe <- pe[pe$method == "MLE", ]

  # Fit paralogistic distribution using optim
  fit_paralogis <- stats::optim(
    par = c(pe$shape, pe$rate),
    fn = neg_log_lik_paralogis,
    data = x,
    method = "L-BFGS-B",
    lower = c(1e-10, 1e-10)
  )

  # optim() minimises the negative log-likelihood, so flip the sign back.
  log_lik <- -fit_paralogis$value
  n_params <- 2 # shape and rate

  # AIC = 2k - 2 * logLik
  2 * n_params - 2 * log_lik
}

Example:

```r

set.seed(123)
x <- tidy_paralogistic(30, .shape = 2, .rate = 1)[["y"]]
util_paralogistic_aic(x)
[1] 31.93101
```


# Stats Tibble

_Function:_
```r
#' Distribution Statistics for Paralogistic Distribution
#'
#' @family Paralogistic
#' @family Distribution Statistics
#'
#' @details This function will take in a tibble and returns the statistics
#' of the given type of `tidy_` distribution. It is required that data be
#' passed from a `tidy_` distribution function.
#'
#' The theoretical statistics are derived from the raw moments of the
#' paralogistic distribution with `scale = 1 / rate`:
#'
#' `E[X^k] = scale^k * gamma(1 + k / shape) * gamma(shape - k / shape) /
#' gamma(shape)`, which is finite only for `k < shape^2`.
#'
#' Consequently the mean is `Inf` when `shape <= 1`, the standard deviation and
#' coefficient of variation are `Inf` when `shape^2 <= 2`, and skewness and
#' kurtosis are reported as the string `"undefined"` when the third or fourth
#' moment does not exist. The `kurtosis` column holds the excess kurtosis
#' (kurtosis minus 3). The mode is
#' `scale * ((shape - 1) / (shape^2 + 1))^(1 / shape)` for `shape > 1` and `0`
#' otherwise.
#'
#' @description Returns distribution statistics in a tibble.
#'
#' @param .data The data being passed from a `tidy_` distribution function.
#'
#' @examples
#' library(dplyr)
#'
#' set.seed(123)
#' tidy_paralogistic(.n = 50, .shape = 5, .rate = 6) |>
#'   util_paralogistic_stats_tbl() |>
#'   glimpse()
#'
#' @return
#' A tibble
#'
#' @name util_paralogistic_stats_tbl
NULL

#' @export
#' @rdname util_paralogistic_stats_tbl

util_paralogistic_stats_tbl <- function(.data) {

  # Immediate check for tidy_ distribution function
  if (!"tibble_type" %in% names(attributes(.data))) {
    rlang::abort(
      message = "You must pass data from the 'tidy_dist' function.",
      use_cli_format = TRUE
    )
  }

  if (attributes(.data)$tibble_type != "tidy_paralogistic") {
    rlang::abort(
      message = "You must use 'tidy_paralogistic()'",
      use_cli_format = TRUE
    )
  }

  # Data
  data_tbl <- dplyr::as_tibble(.data)

  atb <- attributes(data_tbl)
  # NOTE(review): assumes `.shape` and `.rate` are stored as single numeric
  # values by tidy_paralogistic() -- confirm against that function.
  shape <- atb$.shape
  rate <- atb$.rate
  scale <- 1 / rate

  # k-th raw moment; Inf when it does not exist (k >= shape^2). Computed on
  # the log scale so large shape values do not overflow gamma().
  raw_moment <- function(k) {
    if (k >= shape^2) {
      return(Inf)
    }
    scale^k * exp(
      lgamma(1 + k / shape) + lgamma(shape - k / shape) - lgamma(shape)
    )
  }

  m1 <- raw_moment(1)
  m2 <- raw_moment(2)
  m3 <- raw_moment(3)
  m4 <- raw_moment(4)

  stat_mean <- m1
  stat_mode <- if (shape > 1) {
    scale * ((shape - 1) / (shape^2 + 1))^(1 / shape)
  } else {
    0
  }

  # A finite second moment implies a finite first moment (2 < shape^2).
  stat_var <- if (is.finite(m2)) m2 - m1^2 else Inf
  stat_sd <- sqrt(stat_var)
  stat_coef_var <- if (is.finite(stat_var)) stat_sd / stat_mean else Inf

  # Standardised central moments built from the raw moments. The "undefined"
  # sentinel is retained for compatibility with the previous output.
  stat_skewness <- if (is.finite(m3)) {
    (m3 - 3 * m1 * m2 + 2 * m1^3) / stat_var^1.5
  } else {
    "undefined"
  }
  stat_kurtosis <- if (is.finite(m4)) {
    (m4 - 4 * m1 * m3 + 6 * m1^2 * m2 - 3 * m1^4) / stat_var^2 - 3
  } else {
    "undefined"
  }

  # Data Tibble
  ret <- dplyr::tibble(
    tidy_function = atb$tibble_type,
    function_call = atb$dist_with_params,
    distribution = "Paralogistic",
    distribution_type = "Continuous",
    points = atb$.n,
    simulations = atb$.num_sims,
    mean = stat_mean,
    mode_lower = stat_mode,
    range = paste0("0 to Inf"),
    std_dv = stat_sd,
    coeff_var = stat_coef_var,
    skewness = stat_skewness,
    kurtosis = stat_kurtosis,
    computed_std_skew = tidy_skewness_vec(data_tbl$y),
    computed_std_kurt = tidy_kurtosis_vec(data_tbl$y),
    ci_lo = ci_lo(data_tbl$y),
    ci_hi = ci_hi(data_tbl$y)
  )

  # Return
  return(ret)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Development

No branches or pull requests

1 participant