Skip to content

Commit

Permalink
Vendored the specification validation library, added bindings to R.
Browse files Browse the repository at this point in the history
This also prompted some fixes to the prepareDatabaseFiles examples with
respect to the uniqueness of simulated gene indices for each set.
  • Loading branch information
LTLA committed Nov 15, 2024
1 parent 7c3c6f4 commit 2d13c34
Show file tree
Hide file tree
Showing 23 changed files with 1,112 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
^\.github$
^\.gitignore$
^vendor.sh$
^_spec$
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
*.swp
*.html
_spec
*.o
*.so
10 changes: 7 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: gesel
Version: 0.1.1
Date: 2024-11-06
Version: 0.1.2
Date: 2024-11-15
Title: Search for Interesting Gene Sets
License: MIT + file LICENSE
Description:
Expand All @@ -13,12 +13,16 @@ Imports:
utils,
methods,
rappdirs,
httr2
httr2,
Rcpp
Suggests:
BiocStyle,
knitr,
testthat,
rmarkdown
LinkingTo:
assorthead,
Rcpp
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 7.3.2
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,13 @@ export(newConfig)
export(prepareDatabaseFiles)
export(searchGenes)
export(searchSetText)
export(validateDatabaseFiles)
export(validateGeneFiles)
import(httr2)
import(methods)
importFrom(Rcpp,sourceCpp)
importFrom(rappdirs,user_cache_dir)
importFrom(utils,URLencode)
importFrom(utils,head)
importFrom(utils,write.table)
useDynLib(gesel)
11 changes: 11 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

# Thin wrapper emitted by Rcpp::compileAttributes() around the compiled
# routine '_gesel_validate_database_files' in the 'gesel' shared library.
# Both arguments are forwarded to .Call() unchanged, and the value of the
# .Call() is returned as-is.
validate_database_files <- function(db_prefix, num_genes) {
.Call('_gesel_validate_database_files', PACKAGE = 'gesel', db_prefix, num_genes)
}

# Thin wrapper emitted by Rcpp::compileAttributes() around the compiled
# routine '_gesel_validate_gene_files' in the 'gesel' shared library.
# Both arguments are forwarded to .Call() unchanged, and the value of the
# .Call() is returned as-is.
validate_gene_files <- function(gene_prefix, types) {
.Call('_gesel_validate_gene_files', PACKAGE = 'gesel', gene_prefix, types)
}

5 changes: 3 additions & 2 deletions R/prepareDatabaseFiles.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' @param set.info Data frame of information about each gene set, where each row corresponds to a set.
#' This data frame should contain the same columns as that returned by \code{\link{fetchAllSets}}.
#' @param set.membership List of integer vectors, where each vector corresponds to a gene set and contains the indices of its constituent genes.
#' All gene indices should be positive and no greater than \code{num.genes}.
#' All gene indices should be positive, no greater than \code{num.genes}, and unique within each set.
#' @param num.genes Integer scalar specifying the total number of genes available for this species.
#'
#' @return Several files are produced at \code{path} with the \code{<species>_} prefix.
Expand Down Expand Up @@ -48,6 +48,7 @@
#' seq_len(nrow(set.info))
#' )
#' )
#' set.membership <- lapply(set.membership, unique)
#' set.info$size <- lengths(set.membership)
#'
#' # Now making the database files.
Expand Down Expand Up @@ -122,7 +123,7 @@ save_integer_list <- function(x, prefix, include.names = FALSE) {
for (i in seq_along(x)) {
z <- x[[i]]
if (length(z)) {
z <- sort(unique(z)) # convert to diffs to reduce integer size
z <- sort(z) # convert to diffs to reduce integer size
z <- c(z[1] - 1L, diff(z)) # get to 0-based indexing with delta encoding.
lines[i] <- paste(z, collapse="\t")
}
Expand Down
35 changes: 35 additions & 0 deletions R/validateDatabaseFiles.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#' Validate Gesel database files
#'
#' Check Gesel database files and gene mapping files against the specification at \url{https://github.com/gesel-inc/gesel-spec}.
#'
#' @param path String containing the path to a directory.
#' This directory should contain the database files for \code{validateDatabaseFiles}, or the gene mapping files for \code{validateGeneFiles}.
#' @param species String specifying the species in the form of its NCBI taxonomy ID.
#' @param num.genes Integer scalar specifying the total number of genes available for this species.
#' @param types Character vector specifying the types of gene names to validate, e.g., \code{"symbol"}, \code{"entrez"} or \code{"ensembl"}.
#' If \code{NULL}, all detected files for \code{species} in \code{path} are checked.
#'
#' @return \code{validateDatabaseFiles} returns \code{NULL} invisibly.
#'
#' \code{validateGeneFiles} returns the number of genes, to be used as \code{num.genes}.
#'
#' In both functions, invalid formatting will cause an error to be raised.
#'
#' @author Aaron Lun
#'
#' @examples
#' example(prepareDatabaseFiles, echo=FALSE)
#' validateDatabaseFiles(output, "9606", num.genes)
#'
#' @export
#' @importFrom Rcpp sourceCpp
#' @useDynLib gesel
validateDatabaseFiles <- function(path, species, num.genes) {
    # All files for a species share the '<path>/<species>_' prefix.
    db.prefix <- file.path(path, paste0(species, "_"))
    validate_database_files(db.prefix, num.genes)
    invisible(NULL)
}

#' @export
#' @rdname validateDatabaseFiles
validateGeneFiles <- function(path, species, types=NULL) {
    # All gene mapping files for a species share the '<path>/<species>_' prefix.
    gene.prefix <- file.path(path, paste0(species, "_"))
    validate_gene_files(gene.prefix, types)
}
3 changes: 2 additions & 1 deletion man/prepareDatabaseFiles.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions man/validateDatabaseFiles.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 45 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include <Rcpp.h>

using namespace Rcpp;

#ifdef RCPP_USE_GLOBAL_ROSTREAM
Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// validate_database_files
// Forward declaration of the C++ implementation; its definition lives outside this generated file.
SEXP validate_database_files(std::string db_prefix, int num_genes);
// .Call entry point for validate_database_files(): converts the two incoming
// SEXP arguments to a std::string and an int, invokes the implementation and
// wraps its result back into a SEXP.
RcppExport SEXP _gesel_validate_database_files(SEXP db_prefixSEXP, SEXP num_genesSEXP) {
// BEGIN_RCPP/END_RCPP are Rcpp macros bracketing the body; presumably they
// translate C++ exceptions into R errors (confirm against the Rcpp documentation).
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::traits::input_parameter< std::string >::type db_prefix(db_prefixSEXP);
Rcpp::traits::input_parameter< int >::type num_genes(num_genesSEXP);
rcpp_result_gen = Rcpp::wrap(validate_database_files(db_prefix, num_genes));
return rcpp_result_gen;
END_RCPP
}
// validate_gene_files
// Forward declaration of the C++ implementation; its definition lives outside this generated file.
int validate_gene_files(std::string gene_prefix, Rcpp::Nullable<Rcpp::CharacterVector> types);
// .Call entry point for validate_gene_files(): converts the incoming SEXP
// arguments to a std::string and a nullable character vector, invokes the
// implementation and wraps its int result back into a SEXP.
RcppExport SEXP _gesel_validate_gene_files(SEXP gene_prefixSEXP, SEXP typesSEXP) {
// BEGIN_RCPP/END_RCPP are Rcpp macros bracketing the body; presumably they
// translate C++ exceptions into R errors (confirm against the Rcpp documentation).
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::traits::input_parameter< std::string >::type gene_prefix(gene_prefixSEXP);
Rcpp::traits::input_parameter< Rcpp::Nullable<Rcpp::CharacterVector> >::type types(typesSEXP);
rcpp_result_gen = Rcpp::wrap(validate_gene_files(gene_prefix, types));
return rcpp_result_gen;
END_RCPP
}

// Table of .Call routines exposed by this shared library. Each entry gives
// the routine name, its function pointer and its argument count (2 for both).
// The all-NULL entry terminates the table.
static const R_CallMethodDef CallEntries[] = {
{"_gesel_validate_database_files", (DL_FUNC) &_gesel_validate_database_files, 2},
{"_gesel_validate_gene_files", (DL_FUNC) &_gesel_validate_gene_files, 2},
{NULL, NULL, 0}
};

// Initialization hook for the 'gesel' shared library (R looks for a routine
// named R_init_<library> when loading it). Registers the .Call routines in
// CallEntries; no .C, .Fortran or .External routines are registered.
RcppExport void R_init_gesel(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
// Passing FALSE restricts lookup to the registered routines only.
R_useDynamicSymbols(dll, FALSE);
}
90 changes: 90 additions & 0 deletions src/gesel/check_collection_details.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#ifndef GESEL_CHECK_COLLECTION_DETAILS_HPP
#define GESEL_CHECK_COLLECTION_DETAILS_HPP

#include <string>
#include <cstdint>
#include <vector>

#include "byteme/byteme.hpp"

#include "parse_field.hpp"

namespace gesel {

namespace internal {

/**
 * Validate a collection details file together with its Gzipped counterpart.
 *
 * Every line of the uncompressed file at `path` is parsed as five tab-style
 * fields: title, description, species (integer), maintainer and source.
 * The matching line of `path + ".gz"` must carry the same five fields followed
 * by one extra integer field, which is compared against `numbers`.
 *
 * @param path Path to the uncompressed file; the Gzipped file is opened at `path + ".gz"`.
 * @param ranges Expected size of each line of the uncompressed file.
 * The comparison subtracts one from the bytes consumed per line, presumably to exclude the newline.
 * @param numbers Expected value of the trailing integer on each line of the Gzipped file.
 * Must contain at least as many entries as `ranges`.
 *
 * @throws std::runtime_error if the line counts, line sizes or field contents
 * disagree between the two files and the expected `ranges`/`numbers`.
 */
inline void check_collection_details(const std::string& path, const std::vector<uint64_t>& ranges, const std::vector<uint64_t>& numbers) {
    byteme::RawFileReader raw_r(path);
    auto gzpath = path + ".gz";
    byteme::GzipFileReader gzip_r(gzpath);

    byteme::PerByte raw_p(&raw_r);
    byteme::PerByte gzip_p(&gzip_r);

    const uint64_t num_ranges = ranges.size();

    // 'numbers' is indexed by every line number below 'num_ranges', so reject
    // a shorter vector up front instead of reading past its end.
    if (numbers.size() < ranges.size()) {
        throw std::runtime_error("fewer entries in 'numbers' than in 'ranges' when checking '" + path + "'");
    }

    bool raw_valid = raw_p.valid();
    bool gzip_valid = gzip_p.valid();
    uint64_t line = 0;

    while (raw_valid) {
        auto raw_pos = raw_p.position();
        auto title = parse_string_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line);
        auto description = parse_string_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line);
        auto species = parse_integer_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line);
        auto maintainer = parse_string_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line);
        auto source = parse_string_field<FieldType::LAST>(raw_p, raw_valid, path, line);

        if (line >= num_ranges) {
            throw std::runtime_error("number of lines in '" + path + "' exceeds that expected from its '*.ranges.gz' file " + append_line_number(line));
        }
        if (raw_p.position() - raw_pos - 1 != static_cast<size_t>(ranges[line])) {
            throw std::runtime_error("number of bytes per line in '" + path + "' is not the same as that expected from the '*.ranges.gz' file " + append_line_number(line));
        }

        if (!gzip_valid) {
            throw std::runtime_error("early termination of the Gzipped version of '" + path + "'");
        }

        // NOTE(review): the Gzipped stream is parsed with 'path' rather than 'gzpath';
        // if the parsers use this argument in their error messages, formatting errors
        // in the Gzipped file would be attributed to the uncompressed file - confirm.
        auto gz_title = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
        if (gz_title != title) {
            throw std::runtime_error("different title in '" + path + "' compared to its Gzipped version " + append_line_number(line));
        }

        auto gz_description = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
        if (gz_description != description) {
            throw std::runtime_error("different description in '" + path + "' compared to its Gzipped version " + append_line_number(line));
        }

        auto gz_species = parse_integer_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
        if (gz_species != species) {
            throw std::runtime_error("different species in '" + path + "' compared to its Gzipped version " + append_line_number(line));
        }

        auto gz_maintainer = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
        if (gz_maintainer != maintainer) {
            throw std::runtime_error("different maintainer in '" + path + "' compared to its Gzipped version " + append_line_number(line));
        }

        // The source is the last field of the uncompressed line but a middle
        // field of the Gzipped line, which carries one more integer after it.
        auto gz_source = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
        if (gz_source != source) {
            throw std::runtime_error("different source in '" + path + "' compared to its Gzipped version " + append_line_number(line));
        }

        auto gz_number = parse_integer_field<FieldType::LAST>(gzip_p, gzip_valid, path, line);
        if (gz_number != numbers[line]) {
            throw std::runtime_error("different number in '" + path + ".gz' compared to its '*.ranges.gz' file " + append_line_number(line));
        }

        ++line;
    }

    if (line != num_ranges) {
        throw std::runtime_error("number of lines in '" + path + "' is less than that expected from its '*.ranges.gz' file " + append_line_number(line));
    }

    // The uncompressed file is exhausted at this point, so any content left in
    // the Gzipped stream means that it has extra lines with no counterpart.
    if (gzip_valid) {
        throw std::runtime_error("Gzipped version of '" + path + "' contains more lines than the uncompressed file " + append_line_number(line));
    }
}

}

}

#endif
63 changes: 63 additions & 0 deletions src/gesel/check_genes.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#ifndef GESEL_CHECK_GENES_HPP
#define GESEL_CHECK_GENES_HPP

#include <limits>
#include <cstdint>
#include <vector>
#include <string>
#include <stdexcept>
#include <unordered_set>

#include "byteme/byteme.hpp"

#include "parse_field.hpp"
#include "utils.hpp"

namespace gesel {

namespace internal {

/**
 * Validate a Gzipped gene name file.
 *
 * The file is read line by line. An empty line (a lone newline) is accepted as-is.
 * Any other line is split into string fields, each of which must be non-empty
 * and must not repeat an earlier field on the same line.
 *
 * @param path Path to the Gzip-compressed file.
 * @return Number of lines in the file.
 *
 * @throws std::runtime_error if an empty or duplicated name is found,
 * or if the line counter would overflow.
 */
inline uint64_t check_genes(const std::string& path) {
    byteme::GzipFileReader reader(path);
    byteme::PerByte pb(&reader);

    bool valid = pb.valid();
    uint64_t line = 0;
    constexpr uint64_t max_line = std::numeric_limits<uint64_t>::max();
    std::unordered_set<std::string> current_names; // names seen so far on the current line.

    while (valid) {
        if (pb.get() == '\n') {
            // Empty line, nothing to check; just move past the newline.
            valid = pb.advance();
        } else {
            current_names.clear();
            while (true) {
                auto parsed = parse_string_field<FieldType::UNKNOWN>(pb, valid, path, line);
                if (parsed.first == "") {
                    throw std::runtime_error("empty name detected in '" + path + "' " + append_line_number(line));
                }
                // A failed insertion means the name was already present on this line.
                if (!current_names.insert(parsed.first).second) {
                    throw std::runtime_error("duplicated names detected in '" + path + "' " + append_line_number(line));
                }
                // The second element presumably flags that the field ended the line - confirm against parse_string_field.
                if (parsed.second) {
                    break;
                }
            }
        }

        // Guard against wrap-around of the 64-bit line counter before incrementing it.
        if (line == max_line) {
            throw std::runtime_error("number of lines should fit in a 64-bit integer");
        }
        ++line;
    }

    return line;
}

}

}

#endif
Loading

0 comments on commit 2d13c34

Please sign in to comment.