Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filtering for Spectronaut #23

Merged
merged 13 commits into from
Jul 21, 2024
11 changes: 5 additions & 6 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
config:
- { os: macOS-latest, bioc: 'release', curlConfigPath: '/usr/bin/'}
- { os: windows-latest, bioc: 'release'}
- { os: ubuntu-latest, image: "bioconductor/bioconductor_docker:RELEASE_3_18", cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}
- { os: ubuntu-latest, image: "bioconductor/bioconductor_docker:RELEASE_3_19", cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}

env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
Expand Down Expand Up @@ -68,11 +68,10 @@ jobs:
if: runner.os == 'Linux'
env:
RHUB_PLATFORM: linux-x86_64-ubuntu-gcc
run: |
Rscript -e "remotes::install_github('r-hub/sysreqs')"
sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))")
sudo -s eval "$sysreqs"
sudo apt-get update && sudo apt-get -y install libcurl4-openssl-dev libharfbuzz-dev libfribidi-dev
uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
pak-version: devel

- name: Install system dependencies (macOS)
if: runner.os == 'macOS'
Expand Down
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: einprot
Type: Package
Title: A collection of proteomics analysis utilities and workflows
Version: 0.9.4
Version: 0.9.5
Authors@R: c(
person("Charlotte", "Soneson", email = "[email protected]",
role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3833-2169")),
Expand Down Expand Up @@ -41,7 +41,7 @@ Imports:
MsCoreUtils,
msigdbr,
plotly,
QFeatures,
QFeatures (>= 1.14.0),
readr,
rlang,
rmarkdown,
Expand Down Expand Up @@ -78,7 +78,7 @@ Imports:
grid,
Biostrings,
motifStack
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Suggests:
BiocManager,
testthat (>= 3.0.0),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export(doNormalization)
export(doPCA)
export(emptySampleText)
export(expDesignText)
export(filterByModText)
export(filterFragPipe)
export(filterMaxQuant)
export(filterPDTMT)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# einprot 0.9.5

* Add filtering by score and number of peptides to Spectronaut workflow
* Filter by modifications after the normalization in PD-TMT peptide groups workflow

# einprot 0.9.4

* Add details about DIA-NN command line to report
Expand Down
60 changes: 39 additions & 21 deletions R/doFilter.R
Original file line number Diff line number Diff line change
Expand Up @@ -529,21 +529,27 @@ filterFragPipe <- function(sce, minPeptides, plotUpset = TRUE,

#' Filter out features in Spectronaut data
#'
#' Exclude features where the 'PG.ProteinGroups' column ends with the
#' Exclude features with 'PG.Cscore' below \code{minScore},
#' 'PG.NrOfStrippedSequencesIdentified.Experiment.wide' below
#' \code{minPeptides}, or where the 'PG.ProteinGroups' column contains the
#' specified \code{revPattern}.
#'
#' @author Charlotte Soneson
#' @export
#'
#' @param sce A \code{SummarizedExperiment} object (or a derivative).
#' @param minScore Numeric scalar, the minimum allowed value in the 'PG.Cscore'
#' column in order to retain the feature.
#' @param minPeptides Numeric scalar, the minimum allowed value in the
#' 'Combined.Total.Peptides' column in order to retain the feature.
#' 'PG.NrOfStrippedSequencesIdentified.Experiment.wide' column in order to
#' retain the feature.
#' @param plotUpset Logical scalar, whether to generate an UpSet plot
#' detailing the reasons for features being filtered out. Only
#' generated if any feature is in fact filtered out.
#' @param revPattern Character scalar providing the pattern (a regular
#' expression) used to identify decoys (reverse hits). The pattern is
#' matched against the IDs in the FragPipe \code{Protein} column.
#' matched against the IDs in the Spectronaut \code{PG.ProteinGroups}
#' column.
#' @param exclFile Character scalar, the path to a text file where the
#' features that are filtered out are written. If \code{NULL} (default),
#' excluded features are not recorded.
Expand All @@ -555,9 +561,10 @@ filterFragPipe <- function(sce, minPeptides, plotUpset = TRUE,
#' @importFrom ComplexUpset upset
#' @importFrom rlang .data
#'
filterSpectronaut <- function(sce, minPeptides, plotUpset = TRUE,
filterSpectronaut <- function(sce, minScore, minPeptides, plotUpset = TRUE,
revPattern = "_Decoy$", exclFile = NULL) {
.assertVector(x = sce, type = "SummarizedExperiment")
.assertScalar(x = minScore, type = "numeric", allowNULL = TRUE)
.assertScalar(x = minPeptides, type = "numeric", allowNULL = TRUE)
.assertScalar(x = plotUpset, type = "logical")
.assertScalar(x = revPattern, type = "character")
Expand All @@ -568,19 +575,27 @@ filterSpectronaut <- function(sce, minPeptides, plotUpset = TRUE,
"+", "")

filtdf <- as.data.frame(SummarizedExperiment::rowData(sce)) %>%
dplyr::select(dplyr::any_of(c("Reverse"))) %>%
dplyr::select(dplyr::any_of(c("Reverse", "PG.NrOfStrippedSequencesIdentified.Experiment.wide",
"PG.Cscore"))) %>%
dplyr::mutate(across(dplyr::any_of(c("Reverse")),
function(x) as.numeric(x == "+")))
# if ("Combined.Total.Peptides" %in% colnames(filtdf) &&
# !is.null(minPeptides)) {
# filtdf <- filtdf %>%
# dplyr::mutate(
# Combined.Total.Peptides = as.numeric(
# (.data$Combined.Total.Peptides < minPeptides) |
# is.na(.data$Combined.Total.Peptides)))
# } else {
# filtdf$Combined.Total.Peptides <- NULL
# }
if ("PG.NrOfStrippedSequencesIdentified.Experiment.wide" %in% colnames(filtdf) &&
!is.null(minPeptides)) {
filtdf <- filtdf %>%
dplyr::mutate(
PG.NrOfStrippedSequencesIdentified.Experiment.wide = as.numeric(
(.data$PG.NrOfStrippedSequencesIdentified.Experiment.wide < minPeptides) |
is.na(.data$PG.NrOfStrippedSequencesIdentified.Experiment.wide)))
} else {
filtdf$PG.NrOfStrippedSequencesIdentified.Experiment.wide <- NULL
}
if ("PG.Cscore" %in% colnames(filtdf) && !is.null(minScore)) {
filtdf <- filtdf %>%
dplyr::mutate(PG.Cscore = as.numeric((.data$PG.Cscore < minScore) |
is.na(.data$PG.Cscore)))
} else {
filtdf$PG.Cscore <- NULL
}

keep <- seq_len(nrow(sce))
if ("Reverse" %in% colnames(rowData(sce))) {
Expand All @@ -589,12 +604,15 @@ filterSpectronaut <- function(sce, minPeptides, plotUpset = TRUE,
# if ("Potential.contaminant" %in% colnames(rowData(sce))) {
# keep <- intersect(keep, which(rowData(sce)$Potential.contaminant == ""))
# }
# if ("Combined.Total.Peptides" %in% colnames(rowData(sce)) &&
# !is.null(minPeptides)) {
# keep <- intersect(
# keep, which(rowData(sce)$Combined.Total.Peptides >= minPeptides)
# )
# }
if ("PG.NrOfStrippedSequencesIdentified.Experiment.wide" %in% colnames(rowData(sce)) &&
!is.null(minPeptides)) {
keep <- intersect(
keep, which(rowData(sce)$PG.NrOfStrippedSequencesIdentified.Experiment.wide >= minPeptides)
)
}
if ("PG.Cscore" %in% colnames(rowData(sce)) && !is.null(minScore)) {
keep <- intersect(keep, which(rowData(sce)$PG.Cscore >= minScore))
}
exclude <- rowData(sce[setdiff(seq_len(nrow(sce)), keep), ])
sce <- sce[keep, ]

Expand Down
2 changes: 1 addition & 1 deletion R/importDIANN.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ importDIANN <- function(inFile, fileType = "pg_matrix", outLevel = "pg",
stopIfEmpty = stopIfEmpty)

sce <- QFeatures::readSummarizedExperiment(
inFile, ecol = iCols, sep = "\t", check.names = FALSE, ...
inFile, quantCols = iCols, sep = "\t", check.names = FALSE, ...
)

SummarizedExperiment::assayNames(sce) <- aName
Expand Down
2 changes: 1 addition & 1 deletion R/importExperiment.R
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ importExperiment <- function(inFile, iColPattern, includeOnlySamples = "",

if (length(icols) > 0) {
se <- QFeatures::readSummarizedExperiment(
inFile, ecol = icols, sep = "\t", ...
inFile, quantCols = icols, sep = "\t", ...
)
## Add list of columns to metadata
S4Vectors::metadata(se)$cols <- icols
Expand Down
6 changes: 4 additions & 2 deletions R/runMaxQuantAnalysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@
#' retained in the analysis. Set to \code{NULL} if no filtering on the
#' number of peptides is desired.
#' @param imputeMethod Character string defining the imputation method to use.
#' Currently, \code{"impSeqRob"} and \code{"MinProb"} are supported.
#' Currently, \code{"impSeqRob"}, \code{"MinProb"}, and
#' \code{"MinProbGlobal"} are supported. See \code{\link{doImputation}} for
#' more details about the methods.
#' @param assaysForExport Character vector defining the name(s) of the assays
#' to use for exported abundances and barplots. This could, for example,
#' be set to an assay containing 'absolute' abundances, if available, even
Expand Down Expand Up @@ -161,7 +163,7 @@
#' @param seed Numeric, random seed to use for any non-deterministic
#' calculations.
#' @param includeFeatureCollections Character vector, a subset of
#' \code{c("complexes", "GO")}.
#' \code{c("complexes", "GO", "pathways")}.
#' @param minSizeToKeepSet Numeric scalar indicating the smallest number of
#' features that have to overlap with the current data set in order to
#' retain a feature set for testing.
Expand Down
18 changes: 18 additions & 0 deletions R/textSnippets.R
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,24 @@ introText <- function(expType) {
}
}

#' @rdname textSnippets
#' @export
filterByModText <- function(excludeUnmodifiedPeptides, keepModifications) {
    # Build the report sentence describing the modification-based peptide
    # filtering. An empty string is returned when neither filter is requested.
    modsGiven <- !is.null(keepModifications)

    # Sentence tail shared by the two branches that list the requested
    # modifications (harmless to build when keepModifications is NULL).
    modsClause <- paste0("without any of the requested modifications ",
                         "(", paste(keepModifications, collapse = ", "), ").")

    # Both filters requested
    if (excludeUnmodifiedPeptides && modsGiven) {
        return(paste0("Next, we filter out unmodified peptides and peptides ",
                      modsClause))
    }
    # Only unmodified peptides are excluded
    if (excludeUnmodifiedPeptides) {
        return("Next, we filter out unmodified peptides.")
    }
    # Only the list of modifications to keep is given
    if (modsGiven) {
        return(paste0("Next, we filter out peptides ", modsClause))
    }
    ""
}

#' @rdname textSnippets
#' @export
inputText <- function(expTypeLevel) {
Expand Down
25 changes: 19 additions & 6 deletions inst/extdata/einprot_bibliography.bib
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ @ARTICLE{AhlmannEltze2020proda
title = "{proDA}: Probabilistic Dropout Analysis for Identifying
Differentially Abundant Proteins in {Label-Free} Mass
Spectrometry",
author = "Ahlmann-Eltze, Constantin and Anders, Simon",
author = "Ahlmann-Eltze, C and Anders, S",
journal = "bioRxiv doi:https://doi.org/10.1101/661496",
year = 2020
}
Expand Down Expand Up @@ -140,6 +140,19 @@ @ARTICLE{Cox2008maxquant
url = "https://www.nature.com/articles/nbt.1511"
}

@ARTICLE{Soneson2023einprot,
title = "einprot: Flexible, easy-to-use, reproducible workflows for
statistical analysis of quantitative proteomics data",
author = "Soneson, C and Iesmantavicius, V and Hess, D
and Stadler, MB and Seebacher, J",
journal = "J. Open Source Softw.",
volume = 8,
number = 89,
pages = 5750,
url = "https://doi.org/10.21105/joss.05750",
year = 2023
}

@ARTICLE{Orsburn2021pd,
title = "Proteome {Discoverer-A} Community Enhanced Data Processing Suite
for Protein Informatics",
Expand All @@ -153,8 +166,8 @@ @ARTICLE{Orsburn2021pd

@ARTICLE{Rue-Albrecht2018isee,
title = "{iSEE}: Interactive {SummarizedExperiment} Explorer",
author = "Rue-Albrecht, Kevin and Marini, Federico and Soneson, Charlotte
and Lun, Aaron T L",
author = "Rue-Albrecht, K and Marini, F and Soneson, C
and Lun, ATL",
journal = "F1000Res.",
volume = 7,
pages = 741,
Expand All @@ -165,7 +178,7 @@ @ARTICLE{Rue-Albrecht2018isee
@ARTICLE{BenjaminiHochberg1995fdr,
title = "Controlling the false discovery rate: a practical and powerful
approach to multiple testing",
author = "Benjamini, Yoav and Hochberg, Yosef",
author = "Benjamini, Y and Hochberg, Y",
journal = "J. R. Stat. Soc. Series B Stat. Methodol.",
volume = 57,
number = 1,
Expand All @@ -176,8 +189,8 @@ @ARTICLE{BenjaminiHochberg1995fdr
@ARTICLE{Demichev2020diann,
title = "{DIA-NN}: neural networks and interference correction enable deep
proteome coverage in high throughput",
author = "Demichev, Vadim and Messner, Christoph B and Vernardis, Spyros I
and Lilley, Kathryn S and Ralser, Markus",
author = "Demichev, V and Messner, CB and Vernardis, SI
and Lilley, KS and Ralser, M",
journal = "Nat. Methods",
volume = 17,
number = 1,
Expand Down
3 changes: 3 additions & 0 deletions inst/extdata/process_PD_TMT_PTM_template.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -821,10 +821,13 @@ That will open up an iSEE session where you can interactively explore your data.
imptd <- sub("imputed_", "",
grep("imputed_", SummarizedExperiment::assayNames(scePeptides),
value = TRUE))
hmFeature <- rownames(scePeptides)[
min(which(rowSums(!is.na(assay(scePeptides, assayForTests))) > 0))]
makeiSEEScript(iSEEScript = iSEEScript, sceFile = sceFile,
aName = imptd, tests = tests,
assayForPlots = assayForTests,
assayForHeatmaps = assayForTests,
featureForHeatmaps = hmFeature,
includeFeatureSetTable = FALSE)
```

Expand Down
Loading
Loading