added score test option instead of wald test

jr-leary7 · Oct 27, 2024 · db34914 · db34914
1 parent a48170b
commit db34914
Show file tree

Hide file tree

Showing 6 changed files with 148 additions and 11 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -136,6 +136,7 @@ importFrom(stats,fitted.values)
 importFrom(stats,hclust)
 importFrom(stats,kmeans)
 importFrom(stats,logLik)
+importFrom(stats,model.matrix)
 importFrom(stats,offset)
 importFrom(stats,p.adjust)
 importFrom(stats,p.adjust.methods)

diff --git a/R/scoreTestGEE.R b/R/scoreTestGEE.R
@@ -0,0 +1,70 @@
+#' Use a Lagrange multiplier (score) test to compare nested GEE models.
+#'
+#' @name scoreTestGEE
+#' @author Jack R. Leary
+#' @description Performs a basic Lagrange multiplier test to determine whether an alternate model is significantly better than a nested null model. This is the GEE equivalent (kind of) of \code{\link{modelLRT}}. Be careful with small sample sizes.
+#' @importFrom stats model.matrix predict pchisq
+#' @param mod.1 The model under the alternative hypothesis. Must be of class \code{geem}. Defaults to NULL.
+#' @param mod.0 The model under the null hypothesis. Must be of class \code{geem}. Defaults to NULL.
+#' @param alt.df The dataframe used to fit the alternative model. Defaults to NULL. 
+#' @param null.df The dataframe used to fit the null model. Defaults to NULL. 
+#' @param sandwich.var Boolean specifying whether the sandwich variance-covariance matrix should be used. Defaults to FALSE.
+#' @return A list containing the Score test statistic, a \emph{p}-value, and the degrees of freedom used in the test.
+#' @details
+#' \itemize{
+#' \item Calculating the test statistic involves taking the inverse of the variance-covariance matrix of the coefficients. Ideally this would be done using the "true" inverse with something like \code{\link{solve}}, \code{\link{qr.solve}}, or \code{\link{chol2inv}}, but in practice this can cause issues when the variance-covariance matrix is near-singular. With this in mind, we use the Moore-Penrose pseudoinverse as implemented in \code{\link[MASS]{ginv}} instead, which leads to more stable results.
+#' \item The \emph{p}-value is calculated using an asymptotic \eqn{\Chi^2} distribution, with the degrees of freedom equal to the number of non-intercept coefficients in the alternative model.
+#' }
+#' @seealso \code{\link[geeM]{geem}}
+#' @seealso \code{\link{waldTestGEE}}
+#' @seealso \code{\link{modelLRT}}
+
+scoreTestGEE <- function(mod.1 = NULL, 
+                         mod.0 = NULL, 
+                         alt.df = NULL, 
+                         null.df = NULL, 
+                         sandwich.var = FALSE) {
+  # check inputs 
+  if (is.null(mod.1) || is.null(mod.0) || is.null(alt.df) || is.null(null.df)) { stop("Please provide all inputs to scoreTestGEE().") }
+  if (inherits(mod.1, "try-error") || inherits(mod.0, "try-error")) {
+    res <- list(Score_Stat = NA_real_,
+                DF = NA_real_,
+                P_Val = NA_real_,
+                Notes = "No test performed due to model failure.")
+    return(res)
+  }
+  mod.1 <- mod.1$final_mod
+  if (!(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to scoreTestGee().") }
+  if (length(coef(mod.0)) != 1) { stop("Null GEE model must be intercept-only.") }
+  if (length(coef(mod.1)) <= length(coef(mod.0))) {
+    # can't calculate Score statistic if both models are intercept-only
+    res <- list(Score_Stat = 0,
+                DF = 0,
+                P_Val = 1,
+                Notes = NA_character_)
+  } else {
+    X_null <- stats::model.matrix(mod.0, data = null.df)
+    X_alt <- stats::model.matrix(mod.1, data = alt.df)
+    residuals_null <- null.df$Y - exp(stats::predict(mod.0))
+    if (sandwich.var) {
+      V_null <- as.matrix(mod.0$var)
+    } else {
+      V_null <- as.matrix(mod.0$naiv.var)
+    }
+    X_alt_t <- t(X_alt)
+    U <- X_alt_t %*% residuals_null
+    V_U <- (X_alt_t * as.numeric(V_null)) %*% X_alt
+    V_U_inv <- try({ eigenMapMatrixInvert(V_U) }, silent = TRUE)
+    if (inherits(V_U_inv, "try-error")) {
+      V_U_inv <- eigenMapPseudoInverse(V_U)
+    }
+    test_stat <- as.numeric(t(U) %*% V_U_inv %*% U)
+    df1 <- ncol(X_alt) - ncol(X_null)
+    p_value <- 1 - stats::pchisq(test_stat, df = df1)
+    res <- list(Score_Stat = test_stat,
+                DF = df1,
+                P_Val = p_value,
+                Notes = NA_character_)
+  }
+  return(res)
+}
diff --git a/R/testDynamic.R b/R/testDynamic.R
@@ -63,6 +63,7 @@ testDynamic <- function(expr.mat = NULL,
                         is.gee = FALSE,
                         cor.structure = "ar1",
                         gee.bias.correction.method = NULL,
+                        gee.test = "wald", 
                         is.glmm = FALSE,
                         glmm.adaptive = TRUE,
                         id.vec = NULL,
@@ -113,7 +114,9 @@ testDynamic <- function(expr.mat = NULL,
   if ((is.gee || is.glmm) && is.unsorted(id.vec)) { stop("Your data must be ordered by subject, please do so before running testDynamic() with the GEE / GLMM backends.") }
   cor.structure <- tolower(cor.structure)
   if (is.gee && !(cor.structure %in% c("ar1", "independence", "exchangeable"))) { stop("GEE models require a specified correlation structure.") }
-
+  # check GEE testing method
+  gee.test <- tolower(gee.test)
+  if (is.gee & !gee.test %in% c("wald", "score")) { stop("GEE testing method must be one of score or wald.") }
   # set up time tracking
   start_time <- Sys.time()
 
@@ -143,7 +146,7 @@ testDynamic <- function(expr.mat = NULL,
                                 is_read_only = TRUE)
 
   # build list of objects to prevent from being sent to parallel workers
-  necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "gee.bias.correction.method",
+  necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "gee.bias.correction.method", "gee.test", 
                       "verbose", "n_lineages", "id.vec", "cor.structure", "is.gee", "gee.scale.fix", "glmm.adaptive", "size.factor.offset")
   if (any(ls(envir = .GlobalEnv) %in% necessary_vars)) {
     no_export <- c(ls(envir = .GlobalEnv)[-which(ls(envir = .GlobalEnv) %in% necessary_vars)],
@@ -325,7 +328,9 @@ testDynamic <- function(expr.mat = NULL,
      lineage_list[[j]] <- list(Gene = genes[i],
                                Lineage = LETTERS[j],
                                Test_Stat = NA_real_,
-                               Test_Stat_Type = ifelse(is.gee, "Wald", "LRT"),
+                               Test_Stat_Type = ifelse(is.gee, 
+                                                       ifelse(gee.test == "wald", "Wald", "Score"), 
+                                                       "LRT"),
                                Test_Stat_Note = NA_character_,
                                P_Val = NA_real_,
                                LogLik_MARGE = marge_sumy$ll_marge,
@@ -344,18 +349,28 @@ testDynamic <- function(expr.mat = NULL,
 
      # compute test stat using asymptotic Chi-squared approximation
      if (is.gee) {
-       test_res <- waldTestGEE(mod.1 = marge_mod,
-                               mod.0 = null_mod,
-                               correction.method = gee.bias.correction.method,
-                               id.vec = id.vec[lineage_cells],
-                               verbose = verbose)
+       if (gee.test == "wald") {
+         test_res <- waldTestGEE(mod.1 = marge_mod,
+                                 mod.0 = null_mod,
+                                 correction.method = gee.bias.correction.method,
+                                 id.vec = id.vec[lineage_cells],
+                                 verbose = verbose)
+       } else if (gee.test == "score") {
+         test_res <- scoreTestGEE(mod.1 = marge_mod, 
+                                  mod.0 = null_mod, 
+                                  alt.df = as.data.frame(marge_mod$basis_mtx), 
+                                  null.df = null_mod_df, 
+                                  sandwich.var = ifelse(is.null(gee.bias.correction.method), FALSE, TRUE))
+       }
      } else {
        test_res <- modelLRT(mod.1 = marge_mod,
                             mod.0 = null_mod,
                             is.glmm = is.glmm)
      }
      # add test stats to result list
-     lineage_list[[j]]$Test_Stat <- ifelse(is.gee, test_res$Wald_Stat, test_res$LRT_Stat)
+     lineage_list[[j]]$Test_Stat <- ifelse(is.gee, 
+                                           ifelse(gee.test == "wald", test_res$Wald_Stat, test_res$Score_Stat), 
+                                           test_res$LRT_Stat)
      lineage_list[[j]]$Test_Stat_Note <- test_res$Notes
      lineage_list[[j]]$P_Val <- test_res$P_Val
     }
@@ -380,7 +395,9 @@ testDynamic <- function(expr.mat = NULL,
         list(Gene = y,
              Lineage = LETTERS[z],
              Test_Stat = NA_real_,
-             Test_Stat_Type = ifelse(is.gee, "Wald", "LRT"),
+             Test_Stat_Type = ifelse(is.gee, 
+                                     ifelse(gee.test == "wald", "Wald", "Score"), 
+                                     "LRT"),
              Test_Stat_Note = NA_character_,
              P_Val = NA_real_,
              LogLik_MARGE = NA_real_,

diff --git a/R/waldTestGEE.R b/R/waldTestGEE.R
@@ -27,6 +27,7 @@ waldTestGEE <- function(mod.1 = NULL,
                         id.vec = NULL,
                         verbose = FALSE) {
   # check inputs
+  if (is.null(mod.1) || is.null(mod.0)) { stop("Please provide both models to waldTestGEE().") }
   if (inherits(mod.1, "try-error") || inherits(mod.0, "try-error")) {
     res <- list(Wald_Stat = NA_real_,
                 DF = NA_real_,
@@ -43,7 +44,7 @@ waldTestGEE <- function(mod.1 = NULL,
   }
 
   mod.1 <- mod.1$final_mod
-  if (is.null(mod.1) || is.null(mod.0) || !(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to waldTestGee().") }
+  if (!(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to waldTestGee().") }
   if (length(coef(mod.0)) != 1) { stop("Null GEE model must be intercept-only.") }
   if (length(coef(mod.1)) <= length(coef(mod.0))) {
     # can't calculate Wald statistic if both models are intercept-only

diff --git a/man/scoreTestGEE.Rd b/man/scoreTestGEE.Rd
diff --git a/man/testDynamic.Rd b/man/testDynamic.Rd