diff --git a/modules/nf-core/propr/grea/environment.yml b/modules/nf-core/propr/grea/environment.yml index 2bb015a1047..9744dab906b 100644 --- a/modules/nf-core/propr/grea/environment.yml +++ b/modules/nf-core/propr/grea/environment.yml @@ -1,5 +1,7 @@ channels: - conda-forge - bioconda + dependencies: - - conda-forge::r-propr=5.0.4 + - bioconda::bioconductor-limma=3.58.1 + - conda-forge::r-propr=5.1.5 diff --git a/modules/nf-core/propr/grea/main.nf b/modules/nf-core/propr/grea/main.nf index d2e1ee6de9a..fd727208d94 100644 --- a/modules/nf-core/propr/grea/main.nf +++ b/modules/nf-core/propr/grea/main.nf @@ -1,20 +1,20 @@ process PROPR_GREA { tag "$meta.id" - label 'process_single' + label 'process_high' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/r-propr:5.0.4': - 'biocontainers/r-propr:5.0.4' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b6/b65f7192866fbd9a947df15b104808abb720e7a224bbe3ca8f7f8f680f52c97a/data' : + 'community.wave.seqera.io/library/bioconductor-limma_r-propr:f52f1d4fea746393' }" input: - tuple val(meta), path(adj) + tuple val(meta), path(adjacency) tuple val(meta2), path(gmt) output: - tuple val(meta), path("*.go.tsv"), emit: enrichedGO - path "versions.yml", emit: versions - path "*.R_sessionInfo.log", emit: session_info + tuple val(meta), path("*.grea.tsv"), emit: results + path "versions.yml", emit: versions + path "*.R_sessionInfo.log", emit: session_info when: task.ext.when == null || task.ext.when diff --git a/modules/nf-core/propr/grea/meta.yml b/modules/nf-core/propr/grea/meta.yml index 58f73fc4d86..cc0613d3dab 100644 --- a/modules/nf-core/propr/grea/meta.yml +++ b/modules/nf-core/propr/grea/meta.yml @@ -2,12 +2,12 @@ name: "propr_grea" description: Perform Gene Ratio Enrichment Analysis keywords: - - logratio - - differential - propr - grea - - enrichment - - expression + - logratio + - differential expression + - functional enrichment + - functional analysis tools: - "grea": description: "Gene Ratio Enrichment Analysis" @@ -21,34 +21,42 @@ input: - - meta: type: map description: | - Groovy Map containing sample information. + Groovy Map containing data information. This can be used at the workflow level to pass optional parameters to the module. [id: 'test', ...] - - adj: + - adjacency: type: file - description: adjacency matrix for gene ratio proportionality/differential proportionality + description: | + Adjacency matrix representing the graph connections (ie. 1 for edges, 0 otherwise). + This can be the adjacency matrix output from gene ratio approaches like propr/propd. pattern: "*.{csv,tsv}" - - meta2: type: map description: | - Groovy map containing study-wide metadata related to the knowledge database + Groovy Map containing data information. + This can be used at the workflow level to pass optional parameters to the module. + [id: 'test', ...] - gmt: type: file - description: relational database containing genes and GO terms (generated by - mygene module) + description: | + A tab delimited file format that describes gene sets. The first column is the + concept id (eg. GO term, pathway, etc), the second column is the concept description, and the + rest are nodes (eg. genes) that is associated to the given concept. pattern: "*.{gmt}" output: - - enrichedGO: + - results: - meta: - type: map + type: file description: | Groovy Map containing sample information. This can be used at the workflow level to pass optional parameters to the module. [id: 'test', ...] - - "*.go.tsv": + - "*.grea.tsv": type: file - description: File containing GO terms and their enrichment values - pattern: "*.{csv}" + description: | + Output file containing the information about the tested concepts (ie. gene sets) + and enrichment statistics. + pattern: "*.grea.tsv" - versions: - versions.yml: type: file @@ -57,9 +65,11 @@ output: - session_info: - "*.R_sessionInfo.log": type: file - description: R session log + description: dump of R SessionInfo pattern: "*.R_sessionInfo.log" authors: - "@caraiz2001" + - "@suzannejin" maintainers: - "@caraiz2001" + - "@suzannejin" diff --git a/modules/nf-core/propr/grea/templates/grea.R b/modules/nf-core/propr/grea/templates/grea.R index 2d568b70330..3b761f89b1c 100644 --- a/modules/nf-core/propr/grea/templates/grea.R +++ b/modules/nf-core/propr/grea/templates/grea.R @@ -51,66 +51,44 @@ read_delim_flexible <- function(file, header = TRUE, row.names = 1, check.names ) } -#' Converts the .gmt file into a df +#' Loads the .gmt file and converts it into a knowledge database #' -#' @param file_gmt_path path of the .gmt file provided by mygene module. -#' @return output dataframe a Dataframe: 1st column = GOterm, 2nd = Description, 3d to end = genes. -process_gmt_file <- function(file_gmt_path) { - - lines <- readLines(file_gmt_path) - data_list <- list() - - for (line in lines) { - fields <- strsplit(line, "\\t")[[1]] # Split the line based on the tab character - go_term <- fields[1] # Extract the GO term - - # Create a data frame with the GO term in the first column - # Fill in missing values with NA to ensure consistent column lengths - data_list[[go_term]] <- data.frame(GOterm = go_term, - Description = fields[2], - GeneIDs = c(fields[3:length(fields)], rep(NA, max(0, 3 - length(fields))))) +#' @param filename path of the .gmt file +#' @param nodes vector of node (eg. gene) names. Note that this set should be as +#' complete as possible. So it should not only contain the target genes but also +#' the background genes. +#' @return a list with: +#' `db` A knowledge database (matrix) where each row is a graph node (eg. gene) +#' and each column is a concept (eg. GO term, pathway, etc). +#' `description` A list of descriptions for each concept. +load_gmt <- function(filename, nodes) { + + # read gmt file + gmt <- readLines(filename) + gmt <- strsplit(gmt, "\\t") + + # initialize database matrix + db <- matrix(0, nrow = length(nodes), ncol = length(gmt)) + rownames(db) <- nodes + colnames(db) <- sapply(gmt, function(entry) entry[[1]]) + + # description of the concepts + description <- list() + + # for concept in gmt + for (i in 1:length(gmt)) { + + # get concept and description + concept <- gmt[[i]][[1]] + description[[concept]] <- gmt[[i]][[2]] + + # fill 1 if gene is in concept + nodes_in_concept <- gmt[[i]][-c(1, 2)] + nodes_in_concept <- nodes_in_concept[nodes_in_concept %in% nodes] + db[nodes_in_concept, i] <- 1 } - gmt_df <- do.call(rbind, data_list) # Combine all data frames into a single data frame - gmt_df\$GeneIDs <- as.character(gmt_df\$GeneIDs) # Convert gene IDs to character to avoid coercion - - return(gmt_df) -} - -#' Converts the .gmt data frame into a knowledge matrix (contingency table) -#' -#' @param gmt_df .gmt df created by process_gmt_file -#' @return output dataframe. A knowledge database where each row is a graph node (gene) -#' and each column is a concept (GO term). -gmt_to_K<- function(gmt_df){ - - summ_df <- as.data.frame(gmt_df\$GeneIDs) - summ_df <- cbind(summ_df, as.data.frame(gmt_df\$GOterm)) - colnames(summ_df)<- c("GeneIDs", "GOterm") - summ_df<- unique(summ_df) - - summ_df\$value <- 1 - - K <- table(summ_df\$GeneIDs, summ_df\$GOterm) - K <- as.data.frame.matrix(K) - - return(K) -} - -#' Expands knowledge matrix with missing genes to ensure same number of rows for A and K -#' -#' @param adjacency_matrix gene x gene correlation or proportionality adjacency matrix (output propr/propd) -#' @return output dataframe. A knowledge database where each row is a graph node (gene) -#' and each column is a concept (GO term). -add_missing <- function(adjacency_matrix, knowledge_matrix){ - - missing_genes <- setdiff(rownames(adjacency_matrix), rownames(knowledge_matrix)) - extra_rows <- data.frame(matrix(0, nrow = length(missing_genes), ncol = ncol(knowledge_matrix))) - rownames(extra_rows) <- missing_genes - colnames(extra_rows) <- colnames(knowledge_matrix) - - knowledge_matrix <- rbind(knowledge_matrix, extra_rows) - return(knowledge_matrix) + return(list(db = db, description = description)) } ################################################ @@ -119,53 +97,70 @@ add_missing <- function(adjacency_matrix, knowledge_matrix){ ################################################ ################################################ +# Set defaults and classes + opt <- list( - adj = '$adj', - gmt = '$gmt', prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), - permutation = 100, - fixseed = TRUE, + + # input data + adj = '$adjacency', # adjacency matrix + gmt = '$gmt', # knowledge database .gmt file + + # parameters for gene sets + set_min = 15, # minimum number of genes in a set + set_max = 500, # maximum number of genes in a set + + # parameters for permutation test + permutation = 100, # number of permutations to perform + + # other parameters + seed = NA, # seed for reproducibility + round_digits = NA, # number of digits to round results ncores = as.integer('$task.cpus') ) opt_types <- list( + prefix = 'character', adj = 'character', gmt = 'character', - prefix = 'character', + set_min = 'numeric', + set_max = 'numeric', permutation = 'numeric', - fixseed = 'logical', + seed = 'numeric', + round_digits = 'numeric', ncores = 'numeric' ) # Apply parameter overrides -args_opt <- parse_args('$task.ext.args') +args_ext <- ifelse('$task.ext.args' == 'null', '', '$task.ext.args') +args_opt <- parse_args(args_ext) for ( ao in names(args_opt)){ if (! ao %in% names(opt)){ stop(paste("Invalid option:", ao)) } else { # Preserve classes from defaults where possible - if (! is.null(opt[[ao]])){ - args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) - } - # set NA - if (args_opt[[ao]] %in% c('NA', NA, 'null')){ - args_opt[[ao]] <- NA - } + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + + # handle NA, and avoid errors when NA is provided by user as character + if (args_opt[[ao]] %in% c('NA', NA)) args_opt[[ao]] <- NA + + # replace values opt[[ao]] <- args_opt[[ao]] } } # Check if required parameters have been provided + required_opts <- c('adj', 'gmt') # defines a vector required_opts containing the names of the required parameters. missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] if (length(missing) > 0){ stop(paste("Missing required options:", paste(missing, collapse=', '))) } - # Check file inputs are valid + for (file_input in c('adj', 'gmt')){ if (is.null(opt[[file_input]])) { stop(paste("Please provide", file_input), call. = FALSE) @@ -175,6 +170,12 @@ for (file_input in c('adj', 'gmt')){ } } +# check parameters are valid + +if (opt\$permutation < 0) { + stop('permutation should be a positive integer') +} + ################################################ ################################################ ## Finish loading libraries ## @@ -189,20 +190,63 @@ library(propr) ################################################ ################################################ -# Read gene x gene adjacency matrix -A <- read_delim_flexible(opt\$adj, header = TRUE, row.names = 1, check.names = TRUE) +# set seed when required + +if (!is.na(opt\$seed)) { + warning('Setting seed ', opt\$seed, ' for reproducibility') + set.seed(opt\$seed) +} + +# load adjacency matrix +# this matrix should have gene x gene dimensions + +message("Loading input data") + +adj <- as.matrix(read_delim_flexible( + opt\$adj, + header = TRUE, + row.names = 1, + check.names = FALSE +)) +if (nrow(adj) != ncol(adj)) { + stop('Adjacency matrix should be a squared matrix that reflects the connections between all the nodes') +} +if (!all(rownames(adj) == colnames(adj))) { + stop('Adjacency matrix row names are not equal to column names') +} + +# load and process knowledge database + +gmt <- load_gmt( + opt\$gmt, + rownames(adj) # adj should contain all the nodes (target and background) +) -# Read and process gene x GO term matrix -gmt_df <- process_gmt_file(opt\$gmt) -K <- gmt_to_K(gmt_df) +# filter gene sets +# gene sets with less than set_min or more than set_max genes are removed -# Ensure same number of rows (genes) -if (nrow(A) != nrow(K)){ - K <- add_missing(A, K) +idx <- which(colSums(gmt\$db) > opt\$set_min & colSums(gmt\$db) < opt\$set_max) +if (length(idx) == 0){ + stop("No gene set pass the filter of set_min=", opt\$set_min, " and set_max=", opt\$set_max) } +gmt\$db <- gmt\$db[, idx] +gmt\$description <- gmt\$description[idx] -# Run Graflex -G <- runGraflex(A, K, opt\$permutation, opt\$fixseed) +# run GREA +# Basically, it calculates the odds ratio of the graph being enriched in each concept, +# and the FDR of the odds ratio through permutation tests + +message("Running GREA") + +odds <- runGraflex( + adj, + gmt\$db, + p=opt\$permutation, + ncores=opt\$ncores +) +odds\$Description <- sapply(odds\$Concept, function(concept) + gmt\$description[[concept]] +) ################################################ ################################################ @@ -210,14 +254,19 @@ G <- runGraflex(A, K, opt\$permutation, opt\$fixseed) ################################################ ################################################ +if (!is.na(opt\$round_digits)) { + for (col in c('Odds', 'LogOR', 'FDR.under', 'FDR.over')){ + odds[,col] <- round(odds[,col], opt\$round_digits) + } +} + write.table( - G, - file = paste0(opt\$prefix, '.go.tsv'), + odds, + file = paste0(opt\$prefix, '.grea.tsv'), col.names = TRUE, - row.names = TRUE, + row.names = FALSE, sep = '\\t', quote = FALSE - ) ################################################ @@ -236,13 +285,11 @@ sink() ################################################ ################################################ -r.version <- strsplit(version[['version.string']], ' ')[[1]][3] propr.version <- as.character(packageVersion('propr')) writeLines( c( '"${task.process}":', - paste(' r-base:', r.version), paste(' r-propr:', propr.version) ), 'versions.yml') diff --git a/modules/nf-core/propr/grea/tests/grea_test.config b/modules/nf-core/propr/grea/tests/grea_test.config index 8d0d229a76d..7d354c013a5 100644 --- a/modules/nf-core/propr/grea/tests/grea_test.config +++ b/modules/nf-core/propr/grea/tests/grea_test.config @@ -1,8 +1,14 @@ process { - withName: "PROPR_PROPR"{ - ext.args = { "--adjacency true --permutation 5 --fixseed true --cutoff_min 0.05 --cutoff_max 0.95 --cutoff_interval 0.05"} + // set single core for reproducibility + // NOTE this method relies on parallelization and permutation tests + // The permutations are done within each node, which makes set.seed not working properly when + // different nodes are starting/ending depending on the case + cpus = 1 + + withName: "PROPR_PROPD"{ + ext.args = { "--round_digits 5 --save_adjacency true --features_id_col gene_name"} } withName: "PROPR_GREA"{ - ext.args = { "--permutation 5 --fixseed true"} + ext.args = { "--permutation 10 --set_min 10 --seed 123 --round_digits 5"} } -} \ No newline at end of file +} diff --git a/modules/nf-core/propr/grea/tests/main.nf.test b/modules/nf-core/propr/grea/tests/main.nf.test index dd442b43459..38a015e4b8c 100644 --- a/modules/nf-core/propr/grea/tests/main.nf.test +++ b/modules/nf-core/propr/grea/tests/main.nf.test @@ -8,34 +8,34 @@ nextflow_process { tag "modules_nfcore" tag "propr" tag "propr/grea" - tag "mygene" - tag "propr/propr" + tag "propr/propd" - test("grea chained to propr using default options") { + test("test grea chained to propd") { tag "default" config "./grea_test.config" setup { - run("PROPR_PROPR") { - script "../../propr/main.nf" + run("PROPR_PROPD") { + script "../../propd/main.nf" process { """ - input[0] = [ - [ id:'test' ], - file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv") - ] - """ - } - } - run("MYGENE") { - script "../../../mygene/main.nf" - process { - """ - input[0] = [ - [id : 'test'], - file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.gene_meta.tsv") + expression_test_data_dir = params.modules_testdata_base_path + 'genomics/mus_musculus/rnaseq_expression/' + + ch_contrasts = Channel.fromPath(file(expression_test_data_dir + 'SRP254919.contrasts.csv', checkIfExists: true)) + .splitCsv ( header:true, sep:',' ) + .map{ + tuple(it, it.variable, it.reference, it.target) + } + .first() + ch_matrix = [ + [id: 'test'], + file(expression_test_data_dir + 'SRP254919.samplesheet.csv', checkIfExists: true), + file(expression_test_data_dir + 'SRP254919.salmon.merged.gene_counts.top1000cov.tsv', checkIfExists: true) ] + + input[0] = ch_contrasts + input[1] = ch_matrix """ } } @@ -44,8 +44,11 @@ nextflow_process { when { process { """ - input[0] = PROPR_PROPR.out.adj.collect{ meta, adj -> adj }.map{ adj -> [[ id: 'test_adj'], adj]} - input[1] = MYGENE.out.gmt.collect{ meta, gmt -> gmt }.map{ gmt -> [[ id: 'test_gmt'], gmt]} + input[0] = PROPR_PROPD.out.adjacency + input[1] = [ + [id: 'test'], + file(params.modules_testdata_base_path + 'genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt', checkIfExists: true) + ] """ } } @@ -53,10 +56,11 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out.enrichedGO).match("grea chained to propr using default options - enrichedGO") }, - { assert snapshot(process.out.versions).match("versions") } - + { assert snapshot( + process.out.results, + process.out.versions + ).match()} ) } } -} \ No newline at end of file +} diff --git a/modules/nf-core/propr/grea/tests/main.nf.test.snap b/modules/nf-core/propr/grea/tests/main.nf.test.snap index 2db674fc5ae..6c5dd533ed8 100644 --- a/modules/nf-core/propr/grea/tests/main.nf.test.snap +++ b/modules/nf-core/propr/grea/tests/main.nf.test.snap @@ -1,31 +1,26 @@ { - "versions": { - "content": [ - [ - "versions.yml:md5,222a7a8b79b5a2987637279847c609d1" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-04-29T10:45:07.582509" - }, - "grea chained to propr using default options - enrichedGO": { + "test grea chained to propd": { "content": [ [ [ { - "id": "test_adj" + "id": "treatment_mCherry_hND6_", + "variable": "treatment", + "reference": "mCherry", + "target": "hND6", + "blocking": "" }, - "test_adj.go.tsv:md5,904e1fe3eed0f2dded8e5b64321a0269" + "treatment_mCherry_hND6_.grea.tsv:md5,786faeccf39926d2f7c980ef549a2697" ] + ], + [ + "versions.yml:md5,060fcd8ce4afc482e237fa75686a0aba" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-08-03T16:06:25.669444" + "timestamp": "2024-12-11T13:00:02.026244403" } -} \ No newline at end of file +}