Skip to content

Commit

Permalink
Pass cache and fetch= functions via a configuration object.
Browse files Browse the repository at this point in the history
This was motivated by the realization that users could change the fetch
arguments to point to different databases, but gesel functions would
store everything in the same cache, looked up only by function name and
species, so results cached from one database could be returned for
another. Now, the cache is passed as an explicit argument that
corresponds to a specific set of fetch functions.

This change also makes it easier to call gesel functions with different
fetch functions, because we can just pass a single configuration object
rather than having to remember to pass all of the fetch= arguments.
  • Loading branch information
LTLA committed Nov 6, 2024
1 parent d26b3e4 commit 7ce81e0
Show file tree
Hide file tree
Showing 53 changed files with 433 additions and 387 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: gesel
Version: 0.1.0
Date: 2024-10-27
Version: 0.1.1
Date: 2024-11-06
Title: Search for Interesting Gene Sets
License: MIT + file LICENSE
Description:
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ export(findOverlappingSets)
export(flushMemoryCache)
export(geneUrl)
export(mapGenesByName)
export(newConfig)
export(prepareDatabaseFiles)
export(searchGenes)
export(searchSetText)
Expand Down
41 changes: 0 additions & 41 deletions R/cache.R

This file was deleted.

13 changes: 7 additions & 6 deletions R/fetchAllCollections.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
#' Fetch information about all gene set collections in the Gesel database.
#'
#' @param species String containing the NCBI taxonomy ID of the species of interest.
#' @param fetch Function that accepts the name of a Gesel database file and returns an absolute path to that file.
#' @param fetch.args Named list of arguments to pass to \code{fetch}.
#' @param config Configuration list, typically created by \code{\link{newConfig}}.
#' If \code{NULL}, the default configuration is used.
#'
#' @return Data frame of gene set collection information.
#' Each row represents a collection and contains:
Expand All @@ -25,14 +25,15 @@
#'
#' @export
#' @importFrom utils head
fetchAllCollections <- function(species, fetch = downloadDatabaseFile, fetch.args = list()) {
candidate <- get_cache("fetchAllCollections", species)
fetchAllCollections <- function(species, config = NULL) {
config <- get_config(config)
candidate <- get_cache(config, "fetchAllCollections", species)
if (!is.null(candidate)) {
return(candidate)
}

fname <- paste0(species, "_collections.tsv.gz")
path <- do.call(fetch, c(list(fname), fetch.args))
path <- fetch_file(config, fname)
raw <- decompress_lines(path)
details <- strsplit(raw, "\t")

Expand All @@ -56,6 +57,6 @@ fetchAllCollections <- function(species, fetch = downloadDatabaseFile, fetch.arg
size=size
)

set_cache("fetchAllCollections", species, output)
set_cache(config, "fetchAllCollections", species, output)
output
}
14 changes: 7 additions & 7 deletions R/fetchAllGenes.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
#' @param types Character vector specifying the types of gene names to return.
#' This is typically one or more of \code{"symbol"}, \code{"entrez"}, and \code{"ensembl"},
#' defaulting to all of them.
#' @param fetch Function that accepts the name of the file in the Gesel gene descriptions and returns an absolute path to the file.
#' If \code{NULL}, it defaults to \code{\link{downloadGeneFile}}.
#' @param fetch.args Named list of arguments to pass to \code{fetch}.
#' @param config Configuration list, typically created by \code{\link{newConfig}}.
#' If \code{NULL}, the default configuration is used.
#'
#' @return Data frame where each row represents a gene.
#' Each column corresponds to one of the \code{types} and is a list of character vectors.
Expand All @@ -21,12 +20,13 @@
#' head(out$symbol)
#'
#' @export
fetchAllGenes <- function(species, types = NULL, fetch = downloadGeneFile, fetch.args = list()) {
fetchAllGenes <- function(species, types = NULL, config = NULL) {
if (is.null(types)) {
types <- c("symbol", "entrez", "ensembl")
}

cached <- get_cache("fetchAllGenes", species)
config <- get_config(config)
cached <- get_cache(config, "fetchAllGenes", species)
modified <- FALSE
if (is.null(cached)) {
cached <- list()
Expand All @@ -40,7 +40,7 @@ fetchAllGenes <- function(species, types = NULL, fetch = downloadGeneFile, fetch
next
}

path <- do.call(fetch, c(list(paste0(species, "_", t, ".tsv.gz")), fetch.args))
path <- fetch_gene(config, paste0(species, "_", t, ".tsv.gz"))
raw <- decompress_lines(path)
processed <- strsplit(raw, "\t")
for (i in seq_along(processed)) {
Expand All @@ -56,7 +56,7 @@ fetchAllGenes <- function(species, types = NULL, fetch = downloadGeneFile, fetch
}

if (modified) {
set_cache("fetchAllGenes", species, cached)
set_cache(config, "fetchAllGenes", species, cached)
}

do.call(data.frame, lapply(output, I))
Expand Down
11 changes: 6 additions & 5 deletions R/fetchAllSets.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@
#' head(out)
#'
#' @export
fetchAllSets <- function(species, fetch = downloadDatabaseFile, fetch.args = list()) {
candidate <- get_cache("fetchAllSets", species)
fetchAllSets <- function(species, config = NULL) {
config <- get_config(config)
candidate <- get_cache(config, "fetchAllSets", species)
if (!is.null(candidate)) {
return(candidate)
}

fname <- paste0(species, "_sets.tsv.gz")
path <- do.call(fetch, c(list(fname), fetch.args))
path <- fetch_file(config, fname)
raw <- decompress_lines(path)
details <- strsplit(raw, "\t")

Expand All @@ -41,7 +42,7 @@ fetchAllSets <- function(species, fetch = downloadDatabaseFile, fetch.args = lis
size[i] <- current[3]
}

info <- fetchAllCollections(species, fetch=fetch, fetch.args=fetch.args)
info <- fetchAllCollections(species, config=config)
output <- data.frame(
name=names,
description=desc,
Expand All @@ -50,6 +51,6 @@ fetchAllSets <- function(species, fetch = downloadDatabaseFile, fetch.args = lis
number=sequence(info$size)
)

set_cache("fetchAllSets", species, output)
set_cache(config, "fetchAllSets", species, output)
output
}
9 changes: 5 additions & 4 deletions R/fetchGenesForAllSets.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,18 @@
#' fetchAllSets("9606")[1,]
#'
#' @export
fetchGenesForAllSets <- function(species, fetch = downloadDatabaseFile, fetch.args = list()) {
candidate <- get_cache("fetchGenesForAllSets", species)
fetchGenesForAllSets <- function(species, config = NULL) {
config <- get_config(config)
candidate <- get_cache(config, "fetchGenesForAllSets", species)
if (!is.null(candidate)) {
return(candidate)
}

fname <- paste0(species, "_set2gene.tsv.gz")
path <- do.call(fetch, c(list(fname), fetch.args))
path <- fetch_file(config, fname)
raw <- decompress_lines(path)
output <- decode_indices(raw)

set_cache("fetchGenesForAllSets", species, output)
set_cache(config, "fetchGenesForAllSets", species, output)
output
}
22 changes: 9 additions & 13 deletions R/fetchGenesForSomeSets.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,8 @@
#' @param species String containing the NCBI taxonomy ID of the species of interest.
#' @param sets Integer vector containing set indices.
#' Each set index refers to a row in the data frame returned by \code{\link{fetchAllSets}}.
#' @param fetch.file Function that accepts the name of the file in the Gesel database and returns an absolute path to the file.
#' @param fetch.file.args Named list of arguments to pass to \code{fetch.file}.
#' @param fetch.range Function that accepts at least two arguments -
#' the name of the file in the Gesel database, and an integer vector of length 2 containing the zero-indexed half-open byte range to extract from the file
#' (see \code{\link{downloadDatabaseRanges}} for details).
#' It should return a string containing the contents of the specified byte range.
#' @param fetch.range.args Named list of arguments to pass to \code{fetch.range}.
#' @param config Configuration list, typically created by \code{\link{newConfig}}.
#' If \code{NULL}, the default configuration is used.
#'
#' @return List of integer vectors.
#' Each vector corresponds to a set in \code{sets} and contains the identities of its member genes.
Expand All @@ -28,18 +23,19 @@
#' head(gene.symbols[first.set[[1]]])
#'
#' @export
fetchGenesForSomeSets <- function(species, sets, fetch.file = downloadDatabaseFile, fetch.file.args = list(), fetch.range = downloadDatabaseRanges, fetch.range.args = list()) {
candidate <- get_cache("fetchGenesForAllSets", species)
fetchGenesForSomeSets <- function(species, sets, config = NULL) {
config <- get_config(config)
candidate <- get_cache(config, "fetchGenesForAllSets", species)
if (!is.null(candidate)) {
return(candidate[sets])
}

fname <- paste0(species, "_set2gene.tsv")
cached <- get_cache("fetchGenesForSomeSets", species)
cached <- get_cache(config, "fetchGenesForSomeSets", species)
modified <- FALSE

if (is.null(cached)) {
intervals <- retrieve_ranges(fname, fetch=fetch.file, fetch.args=fetch.file.args)
intervals <- retrieve_ranges(config, fname)
cached <- list(intervals = intervals, prior = list(set = integer(0), genes = list()))
modified <- TRUE
}
Expand All @@ -50,7 +46,7 @@ fetchGenesForSomeSets <- function(species, sets, fetch.file = downloadDatabaseFi
needed <- sort(setdiff(sets, prior.set))
if (length(needed)) {
intervals <- cached$intervals
deets <- do.call(fetch.range, c(list(name=fname, start=intervals[needed], end=intervals[needed + 1L]), fetch.range.args))
deets <- fetch_range(config, fname, intervals[needed], intervals[needed + 1L])
prior.set <- c(prior.set, needed)
prior.genes <- c(prior.genes, decode_indices(deets))
modified <- TRUE
Expand All @@ -59,7 +55,7 @@ fetchGenesForSomeSets <- function(species, sets, fetch.file = downloadDatabaseFi
if (modified) {
cached$prior$set <- prior.set
cached$prior$genes <- prior.genes
set_cache("fetchGenesForSomeSets", species, cached)
set_cache(config, "fetchGenesForSomeSets", species, cached)
}

m <- match(sets, prior.set)
Expand Down
9 changes: 5 additions & 4 deletions R/fetchSetsForAllGenes.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,18 @@
#' fetchAllGenes("9606")$symbol[1]
#'
#' @export
fetchSetsForAllGenes <- function(species, fetch = downloadDatabaseFile, fetch.args = list()) {
candidate <- get_cache("fetchSetsForAllGenes", species)
fetchSetsForAllGenes <- function(species, config = NULL) {
config <- get_config(config)
candidate <- get_cache(config, "fetchSetsForAllGenes", species)
if (!is.null(candidate)) {
return(candidate)
}

fname <- paste0(species, "_gene2set.tsv.gz")
path <- do.call(fetch, c(list(fname), fetch.args))
path <- fetch_file(config, fname)
raw <- decompress_lines(path)
output <- decode_indices(raw)

set_cache("fetchSetsForAllGenes", species, output)
set_cache(config, "fetchSetsForAllGenes", species, output)
output
}
26 changes: 14 additions & 12 deletions R/fetchSetsForSomeGenes.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@
#' head(all.set.info[first.gene[[1]],])
#'
#' @export
fetchSetsForSomeGenes <- function(species, genes, fetch.file = downloadDatabaseFile, fetch.file.args = list(), fetch.range = downloadDatabaseRanges, fetch.range.args = list()) {
candidate <- get_cache("fetchSetsForAllGenes", species)
fetchSetsForSomeGenes <- function(species, genes, config = NULL) {
config <- get_config(config)
candidate <- get_cache(config, "fetchSetsForAllGenes", species)
if (!is.null(candidate)) {
return(candidate[genes])
}

fname <- paste0(species, "_gene2set.tsv")
raw.cached <- get_sets_for_some_genes_ranges(species, fname, fetch=fetch.file, fetch.args=fetch.file.args)
raw.cached <- get_sets_for_some_genes_ranges(config, species, fname)
cached <- raw.cached$cached
modified <- raw.cached$modified

Expand All @@ -38,7 +39,7 @@ fetchSetsForSomeGenes <- function(species, genes, fetch.file = downloadDatabaseF
needed <- sort(setdiff(genes, prior.gene))
if (length(needed)) {
intervals <- cached$intervals
deets <- do.call(fetch.range, c(list(name=fname, start=intervals[needed], end=intervals[needed + 1L]), fetch.range.args))
deets <- fetch_range(config, fname, intervals[needed], intervals[needed + 1L])
prior.gene <- c(prior.gene, needed)
prior.sets <- c(prior.sets, decode_indices(deets))
modified <- TRUE
Expand All @@ -47,20 +48,20 @@ fetchSetsForSomeGenes <- function(species, genes, fetch.file = downloadDatabaseF
if (modified) {
cached$prior$gene <- prior.gene
cached$prior$sets <- prior.sets
set_cache("fetchSetsForSomeGenes", species, cached)
set_cache(config, "fetchSetsForSomeGenes", species, cached)
}

m <- match(genes, prior.gene)
prior.sets[m]
}

get_sets_for_some_genes_ranges <- function(species, fname, fetch, fetch.args) {
cached <- get_cache("fetchSetsForSomeGenes", species)
get_sets_for_some_genes_ranges <- function(config, species, fname) {
cached <- get_cache(config, "fetchSetsForSomeGenes", species)
if (!is.null(cached)) {
return(list(cached=cached, modified=FALSE))
}

intervals <- retrieve_ranges(fname, fetch=fetch, fetch.args=fetch.args)
intervals <- retrieve_ranges(config, fname)
cached <- list(intervals = intervals, prior = list(gene = integer(0), sets = list()))
return(list(cached=cached, modified=TRUE))
}
Expand All @@ -82,17 +83,18 @@ get_sets_for_some_genes_ranges <- function(species, fname, fetch, fetch.args) {
#' @author Aaron Lun
#'
#' @export
effectiveNumberOfGenes <- function(species, fetch = downloadDatabaseFile, fetch.args = list()) {
candidate <- get_cache("fetchSetsForAllGenes", species)
effectiveNumberOfGenes <- function(species, config = NULL) {
config <- get_config(config)
candidate <- get_cache(config, "fetchSetsForAllGenes", species)
if (!is.null(candidate)) {
return(sum(lengths(candidate) > 0L))
}

fname <- paste0(species, "_gene2set.tsv")
raw.cached <- get_sets_for_some_genes_ranges(species, fname, fetch=fetch, fetch.args=fetch.args)
raw.cached <- get_sets_for_some_genes_ranges(config, species, fname)
cached <- raw.cached$cached
if (raw.cached$modified) {
set_cache("fetchSetsForSomeGenes", species, cached)
set_cache(config, "fetchSetsForSomeGenes", species, cached)
}

sum(diff(cached$intervals) > 1L) # for the newline character.
Expand Down
Loading

0 comments on commit 7ce81e0

Please sign in to comment.