-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Vendored the specification validation library, added bindings to R.
This also prompted some fixes to the prepareDatabaseFiles examples with respect to the uniqueness of simulated gene indices for each set.
- Loading branch information
Showing
23 changed files
with
1,112 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,4 @@ | ||
^\.github$ | ||
^\.gitignore$ | ||
^vendor.sh$ | ||
^_spec$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
*.swp | ||
*.html | ||
_spec | ||
*.o | ||
*.so |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Generated by using Rcpp::compileAttributes() -> do not edit by hand | ||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
# Generated R-side wrapper: forwards 'db_prefix' and 'num_genes' unchanged to
# the native routine '_gesel_validate_database_files' in the 'gesel' package
# and returns whatever that routine returns.
validate_database_files <- function(db_prefix, num_genes) { | ||
.Call('_gesel_validate_database_files', PACKAGE = 'gesel', db_prefix, num_genes) | ||
} | ||
|
||
# Generated R-side wrapper: forwards 'gene_prefix' and 'types' unchanged to
# the native routine '_gesel_validate_gene_files' in the 'gesel' package
# and returns whatever that routine returns.
validate_gene_files <- function(gene_prefix, types) { | ||
.Call('_gesel_validate_gene_files', PACKAGE = 'gesel', gene_prefix, types) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#' Validate Gesel database files | ||
#' | ||
#' Validate Gesel database and gene mapping files against the specification at \url{https://github.com/gesel-inc/gesel-spec}. | ||
#' | ||
#' @param species String specifying the species in the form of its NCBI taxonomy ID. | ||
#' @param path String containing the path to a directory containing the database files or gene mapping files, for \code{validateDatabaseFiles} and \code{validateGeneFiles} respectively. | ||
#' @param num.genes Integer scalar specifying the total number of genes available for this species. | ||
#' @param types Character vector specifying the types of gene names to validate, e.g., \code{"symbol"}, \code{"entrez"}, or \code{"ensembl"}.
#' If \code{NULL}, all detected files for \code{species} in \code{path} are checked. | ||
#' | ||
#' @return \code{validateDatabaseFiles} returns \code{NULL} invisibly. | ||
#' | ||
#' \code{validateGeneFiles} returns the number of genes, to be used as \code{num.genes}. | ||
#' | ||
#' In both functions, invalid formatting will cause an error to be raised. | ||
#' | ||
#' @author Aaron Lun | ||
#' | ||
#' @examples | ||
#' example(prepareDatabaseFiles, echo=FALSE) | ||
#' validateDatabaseFiles(output, "9606", num.genes) | ||
#' | ||
#' @export | ||
#' @importFrom Rcpp sourceCpp | ||
#' @useDynLib gesel | ||
validateDatabaseFiles <- function(path, species, num.genes) {
    # Build the '<path>/<species>_' prefix that is handed to the native validator.
    db.prefix <- file.path(path, paste0(species, "_"))
    validate_database_files(db.prefix, num.genes)

    # The native call is used only for its error-raising side effect.
    invisible(NULL)
}
|
||
#' @export | ||
#' @rdname validateDatabaseFiles | ||
validateGeneFiles <- function(path, species, types=NULL) {
    # Build the '<path>/<species>_' prefix that is handed to the native validator.
    gene.prefix <- file.path(path, paste0(species, "_"))

    # The value produced by the native validator is returned visibly to the caller.
    validate_gene_files(gene.prefix, types)
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
// Generated by using Rcpp::compileAttributes() -> do not edit by hand | ||
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
#include <Rcpp.h> | ||
|
||
using namespace Rcpp; | ||
|
||
#ifdef RCPP_USE_GLOBAL_ROSTREAM | ||
Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); | ||
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); | ||
#endif | ||
|
||
// validate_database_files | ||
// Declaration only: the implementation of validate_database_files() is not in this file.
SEXP validate_database_files(std::string db_prefix, int num_genes); | ||
// Generated entry point with C linkage conventions (RcppExport), registered
// below under the name "_gesel_validate_database_files". It converts the two
// incoming SEXP arguments to std::string and int, calls
// validate_database_files(), and returns the wrapped result.
RcppExport SEXP _gesel_validate_database_files(SEXP db_prefixSEXP, SEXP num_genesSEXP) { | ||
BEGIN_RCPP | ||
Rcpp::RObject rcpp_result_gen; | ||
Rcpp::traits::input_parameter< std::string >::type db_prefix(db_prefixSEXP); | ||
Rcpp::traits::input_parameter< int >::type num_genes(num_genesSEXP); | ||
rcpp_result_gen = Rcpp::wrap(validate_database_files(db_prefix, num_genes)); | ||
return rcpp_result_gen; | ||
END_RCPP | ||
} | ||
// validate_gene_files | ||
// Declaration only: the implementation of validate_gene_files() is not in this file.
int validate_gene_files(std::string gene_prefix, Rcpp::Nullable<Rcpp::CharacterVector> types); | ||
// Generated entry point, registered below under the name
// "_gesel_validate_gene_files". It converts the incoming SEXP arguments to
// std::string and a nullable character vector, calls validate_gene_files(),
// and returns its int result wrapped as an R object.
RcppExport SEXP _gesel_validate_gene_files(SEXP gene_prefixSEXP, SEXP typesSEXP) { | ||
BEGIN_RCPP | ||
Rcpp::RObject rcpp_result_gen; | ||
Rcpp::traits::input_parameter< std::string >::type gene_prefix(gene_prefixSEXP); | ||
Rcpp::traits::input_parameter< Rcpp::Nullable<Rcpp::CharacterVector> >::type types(typesSEXP); | ||
rcpp_result_gen = Rcpp::wrap(validate_gene_files(gene_prefix, types)); | ||
return rcpp_result_gen; | ||
END_RCPP | ||
} | ||
|
||
// Table of native routines exposed by this file: each entry gives the routine
// name, its function pointer and its argument count (2 for both wrappers).
// The all-NULL entry terminates the table.
static const R_CallMethodDef CallEntries[] = { | ||
{"_gesel_validate_database_files", (DL_FUNC) &_gesel_validate_database_files, 2}, | ||
{"_gesel_validate_gene_files", (DL_FUNC) &_gesel_validate_gene_files, 2}, | ||
{NULL, NULL, 0} | ||
}; | ||
|
||
// Initialisation hook for the 'gesel' shared library: passes CallEntries to
// R_registerRoutines() as the third argument (all other tables are NULL) and
// then calls R_useDynamicSymbols() with FALSE.
RcppExport void R_init_gesel(DllInfo *dll) { | ||
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); | ||
R_useDynamicSymbols(dll, FALSE); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#ifndef GESEL_CHECK_COLLECTION_DETAILS_HPP | ||
#define GESEL_CHECK_COLLECTION_DETAILS_HPP | ||
|
||
#include <string> | ||
#include <cstdint> | ||
#include <vector> | ||
|
||
#include "byteme/byteme.hpp" | ||
|
||
#include "parse_field.hpp" | ||
|
||
namespace gesel { | ||
|
||
namespace internal { | ||
|
||
inline void check_collection_details(const std::string& path, const std::vector<uint64_t>& ranges, const std::vector<uint64_t>& numbers) { | ||
byteme::RawFileReader raw_r(path); | ||
auto gzpath = path + ".gz"; | ||
byteme::GzipFileReader gzip_r(gzpath); | ||
|
||
byteme::PerByte raw_p(&raw_r); | ||
byteme::PerByte gzip_p(&gzip_r); | ||
|
||
bool raw_valid = raw_p.valid(); | ||
bool gzip_valid = gzip_p.valid(); | ||
uint64_t line = 0; | ||
const uint64_t num_ranges = ranges.size(); | ||
|
||
while (raw_valid) { | ||
auto raw_pos = raw_p.position(); | ||
auto title = parse_string_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line); | ||
auto description = parse_string_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line); | ||
auto species = parse_integer_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line); | ||
auto maintainer = parse_string_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line); | ||
auto source = parse_string_field<FieldType::LAST>(raw_p, raw_valid, path, line); | ||
|
||
if (line >= num_ranges) { | ||
throw std::runtime_error("number of lines in '" + path + "' exceeds that expected from its '*.ranges.gz' file " + append_line_number(line)); | ||
} | ||
if (raw_p.position() - raw_pos - 1 != static_cast<size_t>(ranges[line])) { | ||
throw std::runtime_error("number of bytes per line in '" + path + "' is not the same as that expected from the '*.ranges.gz' file " + append_line_number(line)); | ||
} | ||
|
||
if (!gzip_valid) { | ||
throw std::runtime_error("early termination of the Gzipped version of '" + path + "'"); | ||
} | ||
|
||
auto gz_title = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line); | ||
if (gz_title != title) { | ||
throw std::runtime_error("different title in '" + path + "' compared to its Gzipped version " + append_line_number(line)); | ||
} | ||
|
||
auto gz_description = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line); | ||
if (gz_description != description) { | ||
throw std::runtime_error("different description in '" + path + "' compared to its Gzipped version " + append_line_number(line)); | ||
} | ||
|
||
auto gz_species = parse_integer_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line); | ||
if (gz_species != species) { | ||
throw std::runtime_error("different species in '" + path + "' compared to its Gzipped version " + append_line_number(line)); | ||
} | ||
|
||
auto gz_maintainer = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line); | ||
if (gz_maintainer != maintainer) { | ||
throw std::runtime_error("different maintainer in '" + path + "' compared to its Gzipped version " + append_line_number(line)); | ||
} | ||
|
||
auto gz_source = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line); | ||
if (gz_source != source) { | ||
throw std::runtime_error("different source in '" + path + "' compared to its Gzipped version " + append_line_number(line)); | ||
} | ||
|
||
auto gz_number = parse_integer_field<FieldType::LAST>(gzip_p, gzip_valid, path, line); | ||
if (gz_number != numbers[line]) { | ||
throw std::runtime_error("different number in '" + path + ".gz' compared to its '*.ranges.gz' file " + append_line_number(line)); | ||
} | ||
|
||
++line; | ||
} | ||
|
||
if (line != num_ranges) { | ||
throw std::runtime_error("number of lines in '" + path + "' is less than that expected from its '*.ranges.gz' file " + append_line_number(line)); | ||
} | ||
} | ||
|
||
} | ||
|
||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#ifndef GESEL_CHECK_GENES_HPP | ||
#define GESEL_CHECK_GENES_HPP | ||
|
||
#include <limits> | ||
#include <cstdint> | ||
#include <vector> | ||
#include <string> | ||
#include <stdexcept> | ||
#include <unordered_set> | ||
|
||
#include "byteme/byteme.hpp" | ||
|
||
#include "parse_field.hpp" | ||
#include "utils.hpp" | ||
|
||
namespace gesel { | ||
|
||
namespace internal { | ||
|
||
inline uint64_t check_genes(const std::string& path) { | ||
byteme::GzipFileReader reader(path); | ||
byteme::PerByte pb(&reader); | ||
std::vector<uint64_t> output; | ||
|
||
bool valid = pb.valid(); | ||
uint64_t line = 0; | ||
constexpr uint64_t max_line = std::numeric_limits<uint64_t>::max(); | ||
std::unordered_set<std::string> current_names; | ||
|
||
while (valid) { | ||
if (pb.get() == '\n') { | ||
valid = pb.advance(); | ||
} else { | ||
current_names.clear(); | ||
do { | ||
auto parsed = parse_string_field<FieldType::UNKNOWN>(pb, valid, path, line); | ||
if (parsed.first == "") { | ||
throw std::runtime_error("empty name detected in '" + path + "' " + append_line_number(line)); | ||
} | ||
if (current_names.find(parsed.first) != current_names.end()) { | ||
throw std::runtime_error("duplicated names detected in '" + path + "' " + append_line_number(line)); | ||
} | ||
if (parsed.second) { | ||
break; | ||
} | ||
current_names.insert(parsed.first); | ||
} while (true); | ||
} | ||
|
||
if (line == max_line) { | ||
throw std::runtime_error("number of lines should fit in a 32-bit integer"); | ||
} | ||
++line; | ||
} | ||
|
||
return line; | ||
} | ||
|
||
} | ||
|
||
} | ||
|
||
#endif |
Oops, something went wrong.