Merge pull request #5 from fmicompbio/add-fragpipe
Add FragPipe
Showing 65 changed files with 5,489 additions and 832 deletions.
@@ -1,7 +1,7 @@
 Package: einprot
 Type: Package
 Title: A collection of proteomics analysis utilities and workflows
-Version: 0.6.5
+Version: 0.6.8
 Authors@R: c(
     person("Charlotte", "Soneson", email = "[email protected]",
            role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3833-2169")),
@@ -74,6 +74,7 @@ RoxygenNote: 7.2.3
 Suggests:
     BiocManager,
     testthat (>= 3.0.0),
-    iSEE
+    iSEE,
+    matrixStats
 Config/testthat/edition: 3
 VignetteBuilder: knitr
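matrixStats is added under Suggests rather than Imports, i.e. as an optional dependency. As a generic illustration only (not necessarily how einprot itself uses matrixStats), a suggested package is typically guarded at the point of use roughly like this:

## Sketch with made-up data; the call to matrixStats::rowMedians() is
## illustrative of the Suggests pattern, not taken from einprot code.
mat <- matrix(rnorm(20), nrow = 4)
if (requireNamespace("matrixStats", quietly = TRUE)) {
    ## fast path when the suggested package is installed
    rowMed <- matrixStats::rowMedians(mat, na.rm = TRUE)
} else {
    ## base-R fallback so the feature still works without matrixStats
    rowMed <- apply(mat, 1, stats::median, na.rm = TRUE)
}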
@@ -0,0 +1,201 @@
#' Check validity of arguments for FragPipe analysis
#'
#' @keywords internal
#' @noRd
#' @author Charlotte Soneson
#'
#' @importFrom MsCoreUtils normalizeMethods
.checkArgumentsFragPipe <- function(
    templateRmd, outputDir, outputBaseName, reportTitle, reportAuthor,
    forceOverwrite, experimentInfo, species, fragpipeDir,
    idCol, labelCol, geneIdCol, proteinIdCol, stringIdCol,
    iColPattern, sampleAnnot, includeOnlySamples,
    excludeSamples, minScore, minPeptides, imputeMethod, mergeGroups,
    comparisons, ctrlGroup, allPairwiseComparisons, singleFit,
    subtractBaseline, baselineGroup, normMethod, spikeFeatures, stattest,
    minNbrValidValues, minlFC, samSignificance, nperm, volcanoAdjPvalThr,
    volcanoLog2FCThr, volcanoMaxFeatures, volcanoS0, volcanoFeaturesToLabel,
    addInteractiveVolcanos, interactiveDisplayColumns, complexFDRThr,
    maxNbrComplexesToPlot, seed,
    includeFeatureCollections, minSizeToKeepSet, customComplexes,
    complexSpecies, complexDbPath, stringVersion, stringDir, linkTableColumns,
    customYml, doRender
) {
    ## templateRmd
    .assertScalar(x = templateRmd, type = "character")
    if (!file.exists(templateRmd)) {
        stop("'templateRmd' must point to an existing file")
    }

    ## Output specifications
    .assertScalar(x = outputDir, type = "character")
    .assertScalar(x = outputBaseName, type = "character")
    .assertScalar(x = reportTitle, type = "character")
    .assertScalar(x = reportAuthor, type = "character")
    .assertScalar(x = forceOverwrite, type = "logical")
    .assertScalar(x = doRender, type = "logical")

    ## Experiment info
    .assertVector(x = experimentInfo, type = "list")
    if (length(experimentInfo) > 0) {
        .assertVector(x = names(experimentInfo), type = "character")
    }
    tmp <- getSpeciesInfo(species) ## gives an error for unsupported species

    ## FP files
    .assertScalar(x = fragpipeDir, type = "character")
    if (!file.exists(file.path(fragpipeDir, "combined_protein.tsv"))) {
        stop("The file ",
             file.path(fragpipeDir, "combined_protein.tsv"),
             " doesn't exist")
    }
    fpConfigFile <- list.files(fragpipeDir, pattern = "^fragpipe.+.config$",
                               full.names = TRUE)
    if (length(fpConfigFile) > 1) {
        stop("There is more than one config file in the FragPipe directory")
    }
    fpLogFile <- list.files(fragpipeDir, pattern = "^log_.+.txt$",
                            full.names = TRUE)
    if (length(fpLogFile) > 1) {
        stop("There is more than one log file in the FragPipe directory")
    }

    ## Samples to include or exclude
    .assertVector(x = includeOnlySamples, type = "character")
    .assertVector(x = excludeSamples, type = "character")
    if ((length(includeOnlySamples) > 1 || includeOnlySamples != "") &&
        (length(excludeSamples) > 1 || excludeSamples != "")) {
        stop("Please specify max one of includeOnlySamples and excludeSamples")
    }

    ## Names and patterns
    validPatterns <- c("\\\\.Unique\\\\.Spectral\\\\.Count$",
                       "\\\\.Total\\\\.Spectral\\\\.Count$",
                       "\\\\.Unique\\\\.Intensity$",
                       "\\\\.MaxLFQ\\\\.Unique\\\\.Intensity$",
                       "\\\\.MaxLFQ\\\\.Total\\\\.Intensity$",
                       "\\\\.MaxLFQ\\\\.Intensity$")
    .assertScalar(x = iColPattern, type = "character",
                  validValues = c(validPatterns,
                                  gsub("\\\\", "", validPatterns, fixed = TRUE)))
    .assertVector(x = sampleAnnot, type = "data.frame")
    .assertVector(x = colnames(sampleAnnot), type = "character")
    stopifnot(all(c("sample", "group") %in% colnames(sampleAnnot)))
    .assertVector(x = sampleAnnot$group, type = "character")
    ics <- getIntensityColumns(inFile = file.path(fragpipeDir,
                                                  "combined_protein.tsv"),
                               iColPattern = gsub("\\\\", "\\", iColPattern,
                                                  fixed = TRUE),
                               includeOnlySamples = includeOnlySamples,
                               excludeSamples = excludeSamples,
                               stopIfEmpty = TRUE)
    ics <- gsub(gsub("\\\\", "\\", iColPattern,
                     fixed = TRUE), "", ics$iCols)
    msg <- setdiff(ics, sampleAnnot$sample)
    if (length(msg) > 0) {
        stop("Not all sample names are available in the sample annotation. ",
             "Missing samples: ", paste(msg, collapse = ","))
    }

    if (is(idCol, "function")) {
        stopifnot(length(formals(idCol)) == 1)
    } else {
        .assertVector(x = idCol, type = "character")
    }
    if (is(labelCol, "function")) {
        stopifnot(length(formals(labelCol)) == 1)
    } else {
        .assertVector(x = labelCol, type = "character")
    }
    if (is(geneIdCol, "function")) {
        stopifnot(length(formals(geneIdCol)) == 1)
    } else {
        .assertVector(x = geneIdCol, type = "character", allowNULL = TRUE)
    }
    if (is(proteinIdCol, "function")) {
        stopifnot(length(formals(proteinIdCol)) == 1)
    } else {
        .assertVector(x = proteinIdCol, type = "character")
    }
    if (is(stringIdCol, "function")) {
        stopifnot(length(formals(stringIdCol)) == 1)
    } else {
        .assertVector(x = stringIdCol, type = "character", allowNULL = TRUE)
    }

    .assertVector(x = linkTableColumns, type = "character", allowNULL = TRUE)

    ## Score thresholds
    .assertScalar(x = minScore, type = "numeric")
    .assertScalar(x = minPeptides, type = "numeric")

    ## Method choices
    .assertScalar(x = imputeMethod, type = "character",
                  validValues = c("impSeqRob", "MinProb"))
    .assertScalar(x = normMethod, type = "character",
                  validValues = c(MsCoreUtils::normalizeMethods(), "none"))
    .assertVector(x = spikeFeatures, type = "character", allowNULL = TRUE)
    .assertScalar(x = stattest, type = "character",
                  validValues = c("limma", "ttest", "proDA", "none"))

    ## Test parameters
    .assertScalar(x = minNbrValidValues, type = "numeric", rngIncl = c(0, Inf))
    .assertScalar(x = minlFC, type = "numeric", rngIncl = c(0, Inf))
    .assertScalar(x = samSignificance, type = "logical")
    .assertScalar(x = nperm, type = "numeric", rngIncl = c(1, Inf))
    .assertScalar(x = volcanoAdjPvalThr, type = "numeric", rngIncl = c(0, 1))
    .assertScalar(x = volcanoLog2FCThr, type = "numeric", rngIncl = c(0, Inf))
    .assertScalar(x = volcanoMaxFeatures, type = "numeric", rngIncl = c(0, Inf))
    .assertScalar(x = volcanoS0, type = "numeric", rngIncl = c(0, Inf))
    .assertScalar(x = complexFDRThr, type = "numeric", rngIncl = c(0, 1))
    .assertScalar(x = maxNbrComplexesToPlot, type = "numeric", rngIncl = c(0, Inf))
    .assertScalar(x = minSizeToKeepSet, type = "numeric", rngIncl = c(0, Inf))
    .assertVector(x = volcanoFeaturesToLabel, type = "character")
    .assertVector(x = mergeGroups, type = "list")
    .assertVector(x = comparisons, type = "list")
    .assertScalar(x = ctrlGroup, type = "character")
    .assertScalar(x = allPairwiseComparisons, type = "logical")
    .assertScalar(x = addInteractiveVolcanos, type = "logical")
    .assertVector(x = interactiveDisplayColumns, type = "character", allowNULL = TRUE)
    .assertScalar(x = singleFit, type = "logical")
    .assertScalar(x = subtractBaseline, type = "logical")
    .assertScalar(x = baselineGroup, type = "character")

    if (length(mergeGroups) > 0) {
        if (is.null(names(mergeGroups)) || any(names(mergeGroups) == "") ||
            any(duplicated(names(mergeGroups)))) {
            stop("'mergeGroups' must be a named list, without duplicated names")
        }
    }

    if (length(comparisons) > 0) {
        if (!all(vapply(comparisons, length, 0) == 2)) {
            stop("Each entry in 'comparisons' must have exactly two elements")
        }
    }

    ## seed
    .assertScalar(x = seed, type = "numeric", rngIncl = c(1, Inf))

    ## Complexes
    .assertVector(x = includeFeatureCollections, type = "character",
                  validValues = c("complexes", "GO"), allowNULL = TRUE)
    .assertVector(x = customComplexes, type = "list")
    if (length(customComplexes) > 0) {
        .assertVector(x = names(customComplexes), type = "character")
    }
    .assertScalar(x = complexSpecies, type = "character",
                  validValues = c("current", "all"), allowNULL = TRUE)
    .assertScalar(x = complexDbPath, type = "character", allowNULL = TRUE)
    if (!is.null(complexDbPath) && !file.exists(complexDbPath)) {
        stop("'complexDbPath' must point to an existing file")
    }

    .assertScalar(x = stringVersion, type = "character")
    .assertScalar(x = stringDir, type = "character", allowNULL = TRUE)

    .assertScalar(x = customYml, type = "character", allowNULL = TRUE)
    if (!is.null(customYml) && !file.exists(customYml)) {
        stop("'customYml' must point to an existing file")
    }
}
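The backslash handling around iColPattern above is easy to misread. Here is a small standalone sketch, using a made-up intensity column name, of how a doubly escaped pattern (as listed in validPatterns) is normalised into a regular expression and then used to strip the column suffix:

## Standalone illustration of the iColPattern normalisation performed above.
## The column name "sample1.MaxLFQ.Intensity" is invented for this example.
iColPattern <- "\\\\.MaxLFQ\\\\.Intensity$"   ## doubly escaped, as in validPatterns
regexPattern <- gsub("\\\\", "\\", iColPattern, fixed = TRUE)
regexPattern
## [1] "\\.MaxLFQ\\.Intensity$"
gsub(regexPattern, "", "sample1.MaxLFQ.Intensity")
## [1] "sample1"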
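Similarly, a sketch of inputs that would satisfy the structural checks on sampleAnnot, mergeGroups and comparisons; all sample and group names here are invented for illustration:

## Invented example values that pass the structural checks in
## .checkArgumentsFragPipe (names are purely illustrative).
sampleAnnot <- data.frame(
    sample = c("ctrl_1", "ctrl_2", "treat_1", "treat_2"),
    group = c("ctrl", "ctrl", "treat", "treat")
)
## 'mergeGroups' must be a named list without empty or duplicated names
mergeGroups <- list(allCtrl = c("ctrl"), allTreat = c("treat"))
## each entry of 'comparisons' must have exactly two elements
comparisons <- list(c("treat", "ctrl"))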