From 7a5233dc4c4b61d617af54c033c5188ec503e16f Mon Sep 17 00:00:00 2001
From: Alexey Sergushichev <alsergbox@gmail.com>
Date: Mon, 5 Aug 2024 11:06:33 -0500
Subject: [PATCH] check for empty strings in gene names, fixes #149

---
 DESCRIPTION                         |  2 +-
 NEWS                                |  3 +++
 R/fgsea.R                           | 36 +++--------------------------
 R/fgseaORA.R                        | 25 ++++----------------
 R/geseca-utils.R                    | 13 ++---------
 tests/testthat/test_gsea_analysis.R |  4 ++++
 6 files changed, 18 insertions(+), 65 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fe06714..3f3dcb6 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: fgsea
 Title: Fast Gene Set Enrichment Analysis
-Version: 1.31.2
+Version: 1.31.3
 Authors@R: c(person("Gennady", "Korotkevich", role = "aut"),
              person("Vladimir", "Sukhov", role = "aut"),
              person("Nikolay", "Budin", role = "ctb"),
diff --git a/NEWS b/NEWS
index 3ca2aa2..38ab4c3 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,6 @@
+CHANGES IN VERSION 1.31.3
+* check for empty strings in gene names
+
 CHANGES IN VERSION 1.31.2
 * fora() provides fold enrichment scores for more effective prioritisation of results
 
diff --git a/R/fgsea.R b/R/fgsea.R
index 7bd512b..7865541 100644
--- a/R/fgsea.R
+++ b/R/fgsea.R
@@ -39,26 +39,6 @@ fgsea <- function(pathways, stats, minSize = 1, maxSize = length(stats)-1, gseaP
 
 
 preparePathwaysAndStats <- function(pathways, stats, minSize, maxSize, gseaParam, scoreType){
-    # Error if pathways is not a list
-    if (!is.list(pathways)) {
-        stop("pathways should be a list with each element containing names of the stats argument")
-    }
-
-    # Error if stats is not named
-    if (is.null(names(stats))) {
-        stop("stats should be named")
-    }
-
-    # Error if stats names are NA
-    if (any(is.na(names(stats)))) {
-        stop("NAs in names(stats) are not allowed")
-    }
-
-    # Error for duplicate gene names
-    if (any(duplicated(names(stats)))) {
-        stop("Duplicate names(stats) are not allowed")
-    }
-
     # Error if stats are non-finite
     if (any(!is.finite(stats))){
         stop("Not all stats values are finite numbers")
@@ -81,21 +61,11 @@ preparePathwaysAndStats <- function(pathways, stats, minSize, maxSize, gseaParam
     stats <- sort(stats, decreasing=TRUE)
     stats <- abs(stats) ^ gseaParam
 
+    res <- preparePathways(pathways, universe=names(stats), minSize, maxSize)
 
-    minSize <- max(minSize, 1)
-    maxSize <- min(maxSize, length(stats)-1)
+    res$stats <- stats
 
-    pathwaysFiltered <- lapply(pathways, function(p) { unique(na.omit(fmatch(p, names(stats)))) })
-    pathwaysSizes <- sapply(pathwaysFiltered, length)
-
-    toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)
-
-    pathwaysFiltered <- pathwaysFiltered[toKeep]
-    pathwaysSizes <- pathwaysSizes[toKeep]
-
-    list(filtered=pathwaysFiltered,
-         sizes=pathwaysSizes,
-         stats=stats)
+    res
 }
 
 
diff --git a/R/fgseaORA.R b/R/fgseaORA.R
index c80339a..3376e00 100644
--- a/R/fgseaORA.R
+++ b/R/fgseaORA.R
@@ -22,25 +22,12 @@
 #' data(exampleRanks)
 #' foraRes <- fora(examplePathways, genes=tail(names(exampleRanks), 200), universe=names(exampleRanks))
 fora <- function(pathways, genes, universe, minSize=1, maxSize=length(universe)-1) {
-    # Error if pathways is not a list
-    if (!is.list(pathways)) {
-        stop("pathways should be a list with each element containing genes from the universe")
-    }
-
-    # Warning message for duplicate gene names
-    if (any(duplicated(universe))) {
-        warning("There were duplicate genes in universe, they were collapsed")
-        universe <- unique(universe)
-    }
-
-    minSize <- max(minSize, 1)
+    pp <- preparePathways(pathways, universe, minSize, maxSize)
+    pathwaysFiltered <- pp$filtered
+    pathwaysSizes <- pp$sizes
 
-    pathwaysFiltered <- lapply(pathways, function(p) { unique(na.omit(fmatch(p, universe))) })
-    pathwaysSizes <- sapply(pathwaysFiltered, length)
 
-    toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)
-
-    if (length(toKeep) == 0){
+    if (length(pathwaysFiltered) == 0){
         return(data.table(pathway=character(),
                           pval=numeric(),
                           padj=numeric(),
@@ -50,13 +37,11 @@ fora <- function(pathways, genes, universe, minSize=1, maxSize=length(universe)-
                           overlapGenes=list()))
     }
 
-    pathwaysFiltered <- pathwaysFiltered[toKeep]
-    pathwaysSizes <- pathwaysSizes[toKeep]
-
 
     if (!all(genes %in% universe)) {
         warning("Not all of the input genes belong to the universe, such genes were removed")
     }
+
     genesFiltered <- unique(na.omit(fmatch(genes, universe)))
 
     overlaps <- lapply(pathwaysFiltered, intersect, genesFiltered)
diff --git a/R/geseca-utils.R b/R/geseca-utils.R
index e9c21fa..8880fc8 100644
--- a/R/geseca-utils.R
+++ b/R/geseca-utils.R
@@ -26,18 +26,9 @@ checkGesecaArgs <- function(E, pathways){
 }
 
 gesecaPreparePathways <- function(E, pathways, minSize, maxSize){
-    minSize <- max(minSize, 1)
-    maxSize <- min(nrow(E) - 1, maxSize)
+    res <- preparePathways(pathways, universe = rownames(E), minSize, maxSize)
 
-    pathwaysFiltered <- lapply(pathways, function(p) {unique(na.omit(fmatch(p, rownames(E))))})
-    pathwaysSizes <- sapply(pathwaysFiltered, length)
-
-    toKeep <- which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)
-    pathwaysFiltered <- pathwaysFiltered[toKeep]
-    pathwaysSizes <- pathwaysSizes[toKeep]
-
-    return(list(filtered=pathwaysFiltered,
-                sizes=pathwaysSizes))
+    return(res)
 }
 
 
diff --git a/tests/testthat/test_gsea_analysis.R b/tests/testthat/test_gsea_analysis.R
index 931414c..d7f2e23 100644
--- a/tests/testthat/test_gsea_analysis.R
+++ b/tests/testthat/test_gsea_analysis.R
@@ -126,6 +126,10 @@ test_that("fgseaSimple correctly checks gene names", {
     names(ranks)[41] <- NA
     expect_error(fgseaSimple(examplePathways, ranks, nperm=100, minSize=10, maxSize=50, nproc=1))
 
+    ranks <- exampleRanks
+    names(ranks)[41] <- ""
+    expect_error(fgseaSimple(examplePathways, ranks, nperm=100, minSize=10, maxSize=50, nproc=1))
+
     ranks <- unname(exampleRanks)
     expect_error(fgseaSimple(examplePathways, ranks, nperm=100, minSize=10, maxSize=50, nproc=1))