From fcc374375b17b5bb38759c335e5ed0cc1528c58b Mon Sep 17 00:00:00 2001
From: Jack Leary <jrleary@live.unc.edu>
Date: Wed, 13 Nov 2024 13:08:26 -0500
Subject: [PATCH] fixed score test and made it the default testing method for
 GEEs

---
 R/scoreTestGEE.R   | 27 +++++++++++++--------------
 R/testDynamic.R    | 44 ++++++++++++++++++++++----------------------
 man/testDynamic.Rd |  4 ++--
 3 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/R/scoreTestGEE.R b/R/scoreTestGEE.R
index a9085a7..313b767 100644
--- a/R/scoreTestGEE.R
+++ b/R/scoreTestGEE.R
@@ -7,8 +7,8 @@
 #' @importFrom Matrix bdiag
 #' @param mod.1 The model under the alternative hypothesis. Must be of class \code{geem}. Defaults to NULL.
 #' @param mod.0 The model under the null hypothesis. Must be of class \code{geem}. Defaults to NULL.
-#' @param alt.df The dataframe used to fit the alternative model. Defaults to NULL. 
-#' @param null.df The dataframe used to fit the null model. Defaults to NULL. 
+#' @param alt.df The dataframe used to fit the alternative model. Defaults to NULL.
+#' @param null.df The dataframe used to fit the null model. Defaults to NULL.
 #' @param id.vec A vector of subject IDs to use as input to \code{\link{marge2}}. Defaults to NULL.
 #' @param cor.structure A string specifying the working correlation structure used to fit each model. Must be one of "ar1", "independence", or "exchangeable". Defaults to "ar1".
 #' @return A list containing the Score test statistic, a \emph{p}-value, and the degrees of freedom used in the test.
@@ -22,13 +22,13 @@
 #' @seealso \code{\link{waldTestGEE}}
 #' @seealso \code{\link{modelLRT}}
 
-scoreTestGEE <- function(mod.1 = NULL, 
-                         mod.0 = NULL, 
-                         alt.df = NULL, 
-                         null.df = NULL, 
-                         id.vec = NULL, 
+scoreTestGEE <- function(mod.1 = NULL,
+                         mod.0 = NULL,
+                         alt.df = NULL,
+                         null.df = NULL,
+                         id.vec = NULL,
                          cor.structure = "ar1") {
-  # check inputs 
+  # check inputs
   if (is.null(mod.1) || is.null(mod.0) || is.null(alt.df) || is.null(null.df) || is.null(id.vec)) { stop("Please provide all inputs to scoreTestGEE().") }
   if (inherits(mod.1, "try-error") || inherits(mod.0, "try-error")) {
     res <- list(Score_Stat = NA_real_,
@@ -52,10 +52,9 @@ scoreTestGEE <- function(mod.1 = NULL,
     theta <- as.numeric(gsub("\\)", "", gsub(".*\\(", "", mod.1$FunList$family)))
     X_null <- stats::model.matrix(mod.0, data = null.df)
     X_alt <- stats::model.matrix(mod.1, data = alt.df)
-    r_null <- null.df$Y - stats::predict(mod.0)
+    r_null <- null.df$Y - exp(stats::predict(mod.0))
     p_alt <- ncol(X_alt)
     groups <- unique(id.vec)
-    i <- 1
     W <- K <- vector("list", length = length(groups))
     for (i in seq(groups)) {
       group_idx <- which(id.vec == groups[i])
@@ -70,7 +69,7 @@ scoreTestGEE <- function(mod.1 = NULL,
         R_i <- matrix(rho^abs(outer(seq(n_i), seq(n_i), "-")), nrow = n_i, ncol = n_i)
       }
       # create working covariance matrix V_i
-      mu_i <- stats::predict(mod.0)[group_idx]
+      mu_i <- exp(stats::predict(mod.0)[group_idx])
       V_mu_i <- mu_i * (1 + mu_i / theta)
       A_i <- diag(V_mu_i)
       A_i_sqrt <- sqrt(A_i)  # same as taking the 1/2 power of A_i since A_i is diagonal
@@ -91,15 +90,15 @@ scoreTestGEE <- function(mod.1 = NULL,
     if (inherits(K_inv, "try-error")) {
       K_inv <- eigenMapPseudoInverse(K)
     }
-    # generate score vector 
+    # generate score vector
     U <- phi^(-1) * t(X_alt) %*% W %*% K_inv %*% r_null
-    # generate variance of score vector and invert it   
+    # generate variance of score vector and invert it
     V_U <- phi^(-2) * t(X_alt) %*% W %*% X_alt
     V_U_inv <- try({ eigenMapMatrixInvert(V_U) }, silent = TRUE)
     if (inherits(V_U_inv, "try-error")) {
       V_U_inv <- eigenMapPseudoInverse(V_U)
     }
-    # estimate test statistic and accompanying p-value 
+    # estimate test statistic and accompanying p-value
     S <- t(U) %*% V_U_inv %*% U
     S_df <- ncol(X_alt) - ncol(X_null)
     p_value <- 1 - stats::pchisq(S, df = S_df)
diff --git a/R/testDynamic.R b/R/testDynamic.R
index 7d3c70e..25befb4 100644
--- a/R/testDynamic.R
+++ b/R/testDynamic.R
@@ -25,7 +25,7 @@
 #' @param is.gee Should a GEE framework be used instead of the default GLM? Defaults to FALSE.
 #' @param cor.structure If the GEE framework is used, specifies the desired working correlation structure. Must be one of "ar1", "independence", or "exchangeable". Defaults to "ar1".
 #' @param gee.bias.correction.method (Optional) Specify which small-sample bias correction to be used on the sandwich variance-covariance matrix prior to test statistic estimation. Options are "kc" and "df". Defaults to NULL, indicating the use of the model-based variance.
-#' @param gee.test A string specifying the type of test used to estimate the significance of the full model. Must be one of "wald" or "score". Defaults to "wald".
+#' @param gee.test A string specifying the type of test used to estimate the significance of the full model. Must be one of "wald" or "score". Defaults to "score".
 #' @param is.glmm Should a GLMM framework be used instead of the default GLM? Defaults to FALSE.
 #' @param id.vec If a GEE or GLMM framework is being used, a vector of subject IDs to use as input to \code{\link[geeM]{geem}} or \code{\link[glmmTMB]{glmmTMB}}. Defaults to NULL.
 #' @param glmm.adaptive (Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to TRUE.
@@ -64,7 +64,7 @@ testDynamic <- function(expr.mat = NULL,
                         is.gee = FALSE,
                         cor.structure = "ar1",
                         gee.bias.correction.method = NULL,
-                        gee.test = "wald", 
+                        gee.test = "score",
                         is.glmm = FALSE,
                         glmm.adaptive = TRUE,
                         id.vec = NULL,
@@ -111,10 +111,10 @@ testDynamic <- function(expr.mat = NULL,
   colnames(pt) <- paste0("Lineage_", LETTERS[seq(n_lineages)])
 
   # ensure subject ID vector meets criteria for GEE / GLMM fitting
-  if ((is.gee || is.glmm) && is.null(id.vec)) { stop("You must provide a vector of IDs if you're using GEE / GLMM backends.") }
-  if ((is.gee || is.glmm) && is.unsorted(id.vec)) { stop("Your data must be ordered by subject, please do so before running testDynamic() with the GEE / GLMM backends.") }
+  if ((is.gee || is.glmm) && is.null(id.vec)) { stop("You must provide a vector of IDs if you're using GEE / GLMM modes.") }
+  if ((is.gee || is.glmm) && is.unsorted(id.vec)) { stop("Your data must be ordered by subject, please do so before running testDynamic() with the GEE / GLMM modes.") }
   cor.structure <- tolower(cor.structure)
-  if (is.gee && !(cor.structure %in% c("ar1", "independence", "exchangeable"))) { stop("GEE models require a specified correlation structure.") }
+  if (is.gee && !(cor.structure %in% c("ar1", "independence", "exchangeable"))) { stop("GEE models require a valid correlation structure.") }
   # check GEE testing method
   gee.test <- tolower(gee.test)
   if (is.gee & !gee.test %in% c("wald", "score")) { stop("GEE testing method must be one of score or wald.") }
@@ -147,7 +147,7 @@ testDynamic <- function(expr.mat = NULL,
                                 is_read_only = TRUE)
 
   # build list of objects to prevent from being sent to parallel workers
-  necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "gee.bias.correction.method", "gee.test", 
+  necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "gee.bias.correction.method", "gee.test",
                       "verbose", "n_lineages", "id.vec", "cor.structure", "is.gee", "gee.scale.fix", "glmm.adaptive", "size.factor.offset")
   if (any(ls(envir = .GlobalEnv) %in% necessary_vars)) {
     no_export <- c(ls(envir = .GlobalEnv)[-which(ls(envir = .GlobalEnv) %in% necessary_vars)],
@@ -320,11 +320,11 @@ testDynamic <- function(expr.mat = NULL,
      lineage_list[[j]] <- list(Gene = genes[i],
                                Lineage = LETTERS[j],
                                Test_Stat = NA_real_,
-                               Test_Stat_Type = ifelse(is.gee, 
-                                                       ifelse(gee.test == "wald", "Wald", "Score"), 
+                               Test_Stat_Type = ifelse(is.gee,
+                                                       ifelse(gee.test == "wald", "Wald", "Score"),
                                                        "LRT"),
                                Test_Stat_Note = NA_character_,
-                               Degrees_Freedom = NA_real_, 
+                               Degrees_Freedom = NA_real_,
                                P_Val = NA_real_,
                                LogLik_MARGE = marge_sumy$ll_marge,
                                LogLik_Null = null_sumy$null_ll,
@@ -332,7 +332,7 @@ testDynamic <- function(expr.mat = NULL,
                                Dev_Null = null_sumy$null_dev,
                                Model_Status = mod_status,
                                MARGE_Fit_Notes = marge_sumy$marge_fit_notes,
-                               Null_Fit_Notes = null_sumy$null_fit_notes, 
+                               Null_Fit_Notes = null_sumy$null_fit_notes,
                                Gene_Time = gene_time_end_numeric,
                                MARGE_Summary = marge_sumy$marge_sumy_df,
                                Null_Summary = null_sumy$null_sumy_df,
@@ -350,11 +350,11 @@ testDynamic <- function(expr.mat = NULL,
                                  id.vec = id.vec[lineage_cells],
                                  verbose = verbose)
        } else if (gee.test == "score") {
-         test_res <- scoreTestGEE(mod.1 = marge_mod, 
-                                  mod.0 = null_mod, 
-                                  alt.df = as.data.frame(marge_mod$basis_mtx), 
-                                  null.df = null_mod_df, 
-                                  id.vec = id.vec[lineage_cells], 
+         test_res <- scoreTestGEE(mod.1 = marge_mod,
+                                  mod.0 = null_mod,
+                                  alt.df = as.data.frame(marge_mod$basis_mtx),
+                                  null.df = null_mod_df,
+                                  id.vec = id.vec[lineage_cells],
                                   cor.structure = cor.structure)
        }
      } else {
@@ -363,11 +363,11 @@ testDynamic <- function(expr.mat = NULL,
                             is.glmm = is.glmm)
      }
      # add test stats to result list
-     lineage_list[[j]]$Test_Stat <- ifelse(is.gee, 
-                                           ifelse(gee.test == "wald", test_res$Wald_Stat, test_res$Score_Stat), 
+     lineage_list[[j]]$Test_Stat <- ifelse(is.gee,
+                                           ifelse(gee.test == "wald", test_res$Wald_Stat, test_res$Score_Stat),
                                            test_res$LRT_Stat)
      lineage_list[[j]]$Test_Stat_Note <- test_res$Notes
-     lineage_list[[j]]$Degrees_Freedom <- test_res$DF 
+     lineage_list[[j]]$Degrees_Freedom <- test_res$DF
      lineage_list[[j]]$P_Val <- test_res$P_Val
     }
     names(lineage_list) <- paste0("Lineage_", LETTERS[seq(n_lineages)])
@@ -391,11 +391,11 @@ testDynamic <- function(expr.mat = NULL,
         list(Gene = y,
              Lineage = LETTERS[z],
              Test_Stat = NA_real_,
-             Test_Stat_Type = ifelse(is.gee, 
-                                     ifelse(gee.test == "wald", "Wald", "Score"), 
+             Test_Stat_Type = ifelse(is.gee,
+                                     ifelse(gee.test == "wald", "Wald", "Score"),
                                      "LRT"),
              Test_Stat_Note = NA_character_,
-             Degrees_Freedom = NA_real_, 
+             Degrees_Freedom = NA_real_,
              P_Val = NA_real_,
              LogLik_MARGE = NA_real_,
              LogLik_Null = NA_real_,
@@ -404,7 +404,7 @@ testDynamic <- function(expr.mat = NULL,
              Model_Status = x[1],
              Gene_Time = NA_real_,
              MARGE_Fit_Notes = NA_character_,
-             Null_Fit_Notes = NA_character_, 
+             Null_Fit_Notes = NA_character_,
              MARGE_Summary = NULL,
              Null_Summary = NULL,
              MARGE_Preds = NULL,
diff --git a/man/testDynamic.Rd b/man/testDynamic.Rd
index 3319a7f..d318d19 100644
--- a/man/testDynamic.Rd
+++ b/man/testDynamic.Rd
@@ -12,7 +12,7 @@ testDynamic(
   is.gee = FALSE,
   cor.structure = "ar1",
   gee.bias.correction.method = NULL,
-  gee.test = "wald",
+  gee.test = "score",
   is.glmm = FALSE,
   glmm.adaptive = TRUE,
   id.vec = NULL,
@@ -38,7 +38,7 @@ testDynamic(
 
 \item{gee.bias.correction.method}{(Optional) Specify which small-sample bias correction to be used on the sandwich variance-covariance matrix prior to test statistic estimation. Options are "kc" and "df". Defaults to NULL, indicating the use of the model-based variance.}
 
-\item{gee.test}{A string specifying the type of test used to estimate the significance of the full model. Must be one of "wald" or "score". Defaults to "wald".}
+\item{gee.test}{A string specifying the type of test used to estimate the significance of the full model. Must be one of "wald" or "score". Defaults to "score".}
 
 \item{is.glmm}{Should a GLMM framework be used instead of the default GLM? Defaults to FALSE.}