Merge pull request #23 from fmicompbio/add-filtering

Add filtering for Spectronaut
fmicompbio · Jul 21, 2024 · d429fd6 · d429fd6
2 parents cafc672 + 4077dac
commit d429fd6
Show file tree

Hide file tree

Showing 19 changed files with 198 additions and 85 deletions.
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -21,7 +21,7 @@ jobs:
         config:
         - { os: macOS-latest, bioc: 'release', curlConfigPath: '/usr/bin/'}
         - { os: windows-latest, bioc: 'release'}
-        - { os: ubuntu-latest, image: "bioconductor/bioconductor_docker:RELEASE_3_18", cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}
+        - { os: ubuntu-latest, image: "bioconductor/bioconductor_docker:RELEASE_3_19", cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}
 
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
@@ -68,11 +68,10 @@ jobs:
         if: runner.os == 'Linux'
         env:
           RHUB_PLATFORM: linux-x86_64-ubuntu-gcc
-        run: |
-          Rscript -e "remotes::install_github('r-hub/sysreqs')"
-          sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))")
-          sudo -s eval "$sysreqs"
-          sudo apt-get update && sudo apt-get -y install libcurl4-openssl-dev libharfbuzz-dev libfribidi-dev
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          pak-version: devel
 
       - name: Install system dependencies (macOS)
         if: runner.os == 'macOS'

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: einprot
 Type: Package
 Title: A collection of proteomics analysis utilities and workflows
-Version: 0.9.4
+Version: 0.9.5
 Authors@R: c(
     person("Charlotte", "Soneson", email = "[email protected]", 
            role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3833-2169")),
@@ -41,7 +41,7 @@ Imports:
     MsCoreUtils,
     msigdbr,
     plotly,
-    QFeatures,
+    QFeatures (>= 1.14.0),
     readr,
     rlang,
     rmarkdown,
@@ -78,7 +78,7 @@ Imports:
     grid,
     Biostrings,
     motifStack
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Suggests: 
     BiocManager,
     testthat (>= 3.0.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -11,6 +11,7 @@ export(doNormalization)
 export(doPCA)
 export(emptySampleText)
 export(expDesignText)
+export(filterByModText)
 export(filterFragPipe)
 export(filterMaxQuant)
 export(filterPDTMT)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# einprot 0.9.5
+
+* Add filtering by score and number of peptides to the Spectronaut workflow
+* Filter by modifications after normalization in the PD-TMT peptide groups workflow
+
 # einprot 0.9.4
 
 * Add details about DIA-NN command line to report

diff --git a/R/doFilter.R b/R/doFilter.R
@@ -529,21 +529,27 @@ filterFragPipe <- function(sce, minPeptides, plotUpset = TRUE,
 
 #' Filter out features in Spectronaut data
 #'
-#' Exclude features where the 'PG.ProteinGroups' column ends with the
+#' Exclude features with 'PG.Cscore' below \code{minScore},
+#' 'PG.NrOfStrippedSequencesIdentified.Experiment.wide' below
+#' \code{minPeptides}, or where the 'PG.ProteinGroups' column contains the
 #' specified \code{revPattern}.
 #'
 #' @author Charlotte Soneson
 #' @export
 #'
 #' @param sce A \code{SummarizedExperiment} object (or a derivative).
+#' @param minScore Numeric scalar, the minimum allowed value in the 'PG.Cscore'
+#'     column in order to retain the feature. Set to \code{NULL} to disable.
 #' @param minPeptides Numeric scalar, the minimum allowed value in the
-#'     'Combined.Total.Peptides' column in order to retain the feature.
+#'     'PG.NrOfStrippedSequencesIdentified.Experiment.wide' column in order to
+#'     retain the feature. Set to \code{NULL} to disable this filter.
 #' @param plotUpset Logical scalar, whether to generate an UpSet plot
 #'     detailing the reasons for features being filtered out. Only
 #'     generated if any feature is in fact filtered out.
 #' @param revPattern Character scalar providing the pattern (a regular
 #'     expression) used to identify decoys (reverse hits). The pattern is
-#'     matched against the IDs in the FragPipe \code{Protein} column.
+#'     matched against the IDs in the Spectronaut \code{PG.ProteinGroups}
+#'     column.
 #' @param exclFile Character scalar, the path to a text file where the
 #'     features that are filtered out are written. If \code{NULL} (default),
 #'     excluded features are not recorded.
@@ -555,9 +561,10 @@ filterFragPipe <- function(sce, minPeptides, plotUpset = TRUE,
 #' @importFrom ComplexUpset upset
 #' @importFrom rlang .data
 #'
-filterSpectronaut <- function(sce, minPeptides, plotUpset = TRUE,
+filterSpectronaut <- function(sce, minScore, minPeptides, plotUpset = TRUE,
                               revPattern = "_Decoy$", exclFile = NULL) {
     .assertVector(x = sce, type = "SummarizedExperiment")
+    .assertScalar(x = minScore, type = "numeric", allowNULL = TRUE)
     .assertScalar(x = minPeptides, type = "numeric", allowNULL = TRUE)
     .assertScalar(x = plotUpset, type = "logical")
     .assertScalar(x = revPattern, type = "character")
@@ -568,19 +575,27 @@ filterSpectronaut <- function(sce, minPeptides, plotUpset = TRUE,
                                    "+", "")
 
     filtdf <- as.data.frame(SummarizedExperiment::rowData(sce)) %>%
-        dplyr::select(dplyr::any_of(c("Reverse"))) %>%
+        dplyr::select(dplyr::any_of(c("Reverse", "PG.NrOfStrippedSequencesIdentified.Experiment.wide",
+                                      "PG.Cscore"))) %>%
         dplyr::mutate(across(dplyr::any_of(c("Reverse")),
                              function(x) as.numeric(x == "+")))
-    # if ("Combined.Total.Peptides" %in% colnames(filtdf) &&
-    #     !is.null(minPeptides)) {
-    #     filtdf <- filtdf %>%
-    #         dplyr::mutate(
-    #             Combined.Total.Peptides = as.numeric(
-    #                 (.data$Combined.Total.Peptides < minPeptides) |
-    #                     is.na(.data$Combined.Total.Peptides)))
-    # } else {
-    #     filtdf$Combined.Total.Peptides <- NULL
-    # }
+    if ("PG.NrOfStrippedSequencesIdentified.Experiment.wide" %in% colnames(filtdf) &&
+        !is.null(minPeptides)) {
+        filtdf <- filtdf %>%
+            dplyr::mutate(
+                PG.NrOfStrippedSequencesIdentified.Experiment.wide = as.numeric(
+                    (.data$PG.NrOfStrippedSequencesIdentified.Experiment.wide < minPeptides) |
+                        is.na(.data$PG.NrOfStrippedSequencesIdentified.Experiment.wide)))
+    } else {
+        filtdf$PG.NrOfStrippedSequencesIdentified.Experiment.wide <- NULL
+    }
+    if ("PG.Cscore" %in% colnames(filtdf) && !is.null(minScore)) {
+        filtdf <- filtdf %>%
+            dplyr::mutate(PG.Cscore = as.numeric((.data$PG.Cscore < minScore) |
+                                                     is.na(.data$PG.Cscore)))
+    } else {
+        filtdf$PG.Cscore <- NULL
+    }
 
     keep <- seq_len(nrow(sce))
     if ("Reverse" %in% colnames(rowData(sce))) {
@@ -589,12 +604,15 @@ filterSpectronaut <- function(sce, minPeptides, plotUpset = TRUE,
     # if ("Potential.contaminant" %in% colnames(rowData(sce))) {
     #     keep <- intersect(keep, which(rowData(sce)$Potential.contaminant == ""))
     # }
-    # if ("Combined.Total.Peptides" %in% colnames(rowData(sce)) &&
-    #     !is.null(minPeptides)) {
-    #     keep <- intersect(
-    #         keep, which(rowData(sce)$Combined.Total.Peptides >= minPeptides)
-    #     )
-    # }
+    if ("PG.NrOfStrippedSequencesIdentified.Experiment.wide" %in% colnames(rowData(sce)) &&
+        !is.null(minPeptides)) {
+        keep <- intersect(
+            keep, which(rowData(sce)$PG.NrOfStrippedSequencesIdentified.Experiment.wide >= minPeptides)
+        )
+    }
+    if ("PG.Cscore" %in% colnames(rowData(sce)) && !is.null(minScore)) {
+        keep <- intersect(keep, which(rowData(sce)$PG.Cscore >= minScore))
+    }
     exclude <- rowData(sce[setdiff(seq_len(nrow(sce)), keep), ])
     sce <- sce[keep, ]
 

diff --git a/R/importDIANN.R b/R/importDIANN.R
@@ -72,7 +72,7 @@ importDIANN <- function(inFile, fileType = "pg_matrix", outLevel = "pg",
                            stopIfEmpty = stopIfEmpty)
 
         sce <- QFeatures::readSummarizedExperiment(
-            inFile, ecol = iCols, sep = "\t", check.names = FALSE, ...
+            inFile, quantCols = iCols, sep = "\t", check.names = FALSE, ...
         )
 
         SummarizedExperiment::assayNames(sce) <- aName

diff --git a/R/importExperiment.R b/R/importExperiment.R
@@ -294,7 +294,7 @@ importExperiment <- function(inFile, iColPattern, includeOnlySamples = "",
 
         if (length(icols) > 0) {
             se <- QFeatures::readSummarizedExperiment(
-                inFile, ecol = icols, sep = "\t", ...
+                inFile, quantCols = icols, sep = "\t", ...
             )
             ## Add list of columns to metadata
             S4Vectors::metadata(se)$cols <- icols

diff --git a/R/runMaxQuantAnalysis.R b/R/runMaxQuantAnalysis.R
@@ -71,7 +71,9 @@
 #'     retained in the analysis. Set to \code{NULL} if no filtering on the
 #'     number of peptides is desired.
 #' @param imputeMethod Character string defining the imputation method to use.
-#'     Currently, \code{"impSeqRob"} and \code{"MinProb"} are supported.
+#'     Currently, \code{"impSeqRob"}, \code{"MinProb"}, and
+#'     \code{"MinProbGlobal"} are supported. See \code{\link{doImputation}} for
+#'     more details about the methods.
 #' @param assaysForExport Character vector defining the name(s) of the assays
 #'     to use for exported abundances and barplots. This could, for example,
 #'     be set to an assay containing 'absolute' abundances, if available, even
@@ -161,7 +163,7 @@
 #' @param seed Numeric, random seed to use for any non-deterministic
 #'     calculations.
 #' @param includeFeatureCollections Character vector, a subset of
-#'     \code{c("complexes", "GO")}.
+#'     \code{c("complexes", "GO", "pathways")}.
 #' @param minSizeToKeepSet Numeric scalar indicating the smallest number of
 #'     features that have to overlap with the current data set in order to
 #'     retain a feature set for testing.

diff --git a/R/textSnippets.R b/R/textSnippets.R
@@ -231,6 +231,24 @@ introText <- function(expType) {
     }
 }
 
+#' @rdname textSnippets
+#' @export
+filterByModText <- function(excludeUnmodifiedPeptides, keepModifications) {
+    if (excludeUnmodifiedPeptides && !is.null(keepModifications)) {  # both filters requested
+        paste0("Next, we filter out unmodified peptides and peptides ",
+               "without any of the requested modifications ",
+               "(", paste(keepModifications, collapse = ", "), ").")  # modifications listed comma-separated
+    } else if (excludeUnmodifiedPeptides) {  # only unmodified peptides are excluded
+        paste0("Next, we filter out unmodified peptides.")
+    } else if (!is.null(keepModifications)) {  # only the modification filter is requested
+        paste0("Next, we filter out peptides ",
+               "without any of the requested modifications ",
+               "(", paste(keepModifications, collapse = ", "), ").")
+    } else {  # no modification-based filtering: the snippet is an empty string
+        ""
+    }
+}
+
 #' @rdname textSnippets
 #' @export
 inputText <- function(expTypeLevel) {

diff --git a/inst/extdata/einprot_bibliography.bib b/inst/extdata/einprot_bibliography.bib
@@ -41,7 +41,7 @@ @ARTICLE{AhlmannEltze2020proda
   title    = "{proDA}: Probabilistic Dropout Analysis for Identifying
               Differentially Abundant Proteins in {Label-Free} Mass
               Spectrometry",
-  author   = "Ahlmann-Eltze, Constantin and Anders, Simon",
+  author   = "Ahlmann-Eltze, C and Anders, S",
   journal  = "bioRxiv doi:https://doi.org/10.1101/661496",
   year     =  2020
 }
@@ -140,6 +140,19 @@ @ARTICLE{Cox2008maxquant
   url      = "https://www.nature.com/articles/nbt.1511"
 }
 
+@ARTICLE{Soneson2023einprot,
+  title     = "einprot: Flexible, easy-to-use, reproducible workflows for
+               statistical analysis of quantitative proteomics data",
+  author    = "Soneson, C and Iesmantavicius, V and Hess, D
+               and Stadler, MB and Seebacher, J",
+  journal   = "J. Open Source Softw.",
+  volume    =  8,
+  number    =  89,
+  pages     =  5750,
+  url       = "https://doi.org/10.21105/joss.05750",
+  year      =  2023
+}
+
 @ARTICLE{Orsburn2021pd,
   title    = "Proteome {Discoverer-A} Community Enhanced Data Processing Suite
               for Protein Informatics",
@@ -153,8 +166,8 @@ @ARTICLE{Orsburn2021pd
 
 @ARTICLE{Rue-Albrecht2018isee,
   title    = "{iSEE}: Interactive {SummarizedExperiment} Explorer",
-  author   = "Rue-Albrecht, Kevin and Marini, Federico and Soneson, Charlotte
-              and Lun, Aaron T L",
+  author   = "Rue-Albrecht, K and Marini, F and Soneson, C
+              and Lun, ATL",
   journal  = "F1000Res.",
   volume   =  7,
   pages    =  741,
@@ -165,7 +178,7 @@ @ARTICLE{Rue-Albrecht2018isee
 @ARTICLE{BenjaminiHochberg1995fdr,
   title   = "Controlling the false discovery rate: a practical and powerful
              approach to multiple testing",
-  author  = "Benjamini, Yoav and Hochberg, Yosef",
+  author  = "Benjamini, Y and Hochberg, Y",
   journal = "J. R. Stat. Soc. Series B Stat. Methodol.",
   volume  =  57,
   number  =  1,
@@ -176,8 +189,8 @@ @ARTICLE{BenjaminiHochberg1995fdr
 @ARTICLE{Demichev2020diann,
   title    = "{DIA-NN}: neural networks and interference correction enable deep
               proteome coverage in high throughput",
-  author   = "Demichev, Vadim and Messner, Christoph B and Vernardis, Spyros I
-              and Lilley, Kathryn S and Ralser, Markus",
+  author   = "Demichev, V and Messner, CB and Vernardis, SI
+              and Lilley, KS and Ralser, M",
   journal  = "Nat. Methods",
   volume   =  17,
   number   =  1,

diff --git a/inst/extdata/process_PD_TMT_PTM_template.Rmd b/inst/extdata/process_PD_TMT_PTM_template.Rmd
@@ -821,10 +821,13 @@ That will open up an iSEE session where you can interactively explore your data.
 imptd <- sub("imputed_", "", 
              grep("imputed_", SummarizedExperiment::assayNames(scePeptides), 
                   value = TRUE))
+hmFeature <- rownames(scePeptides)[
+    min(which(rowSums(!is.na(assay(scePeptides, assayForTests))) > 0))]
 makeiSEEScript(iSEEScript = iSEEScript, sceFile = sceFile,
                aName = imptd, tests = tests,
                assayForPlots = assayForTests,
                assayForHeatmaps = assayForTests,
+               featureForHeatmaps = hmFeature, 
                includeFeatureSetTable = FALSE)
 ```