Skip to content

Commit

Permalink
add genomecov (#411)
Browse files Browse the repository at this point in the history
* add genomecov functionality, closes #410

* change column output, depth -> .depth, for consistency with other valr outputs

* styler::style_pkg()

* fix note
  • Loading branch information
kriemo authored Oct 11, 2023
1 parent 69a2b8f commit a4093ef
Show file tree
Hide file tree
Showing 20 changed files with 561 additions and 4 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export(bed_complement)
export(bed_coverage)
export(bed_fisher)
export(bed_flank)
export(bed_genomecov)
export(bed_glyph)
export(bed_intersect)
export(bed_jaccard)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# valr (development version)

* Added `bed_genomecov()` to compute interval coverage across a genome.

# valr 0.7.0

* `read_bed` and related functions now automatically calculate the fields. Use of `n_fields` was deprecated.
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ flank_impl <- function(df, genome, both = 0, left = 0, right = 0, fraction = FAL
.Call(`_valr_flank_impl`, df, genome, both, left, right, fraction, stranded, trim)
}

gcoverage_impl <- function(gdf, max_coords) {
.Call(`_valr_gcoverage_impl`, gdf, max_coords)
}

intersect_impl <- function(x, y, x_grp_indexes, y_grp_indexes, invert = FALSE, suffix_x = ".x", suffix_y = ".y") {
.Call(`_valr_intersect_impl`, x, y, x_grp_indexes, y_grp_indexes, invert, suffix_x, suffix_y)
}
Expand Down
121 changes: 121 additions & 0 deletions R/bed_genomecov.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#' Calculate coverage across a genome
#'
#' This function is useful for calculating interval coverage across an entire genome.
#'
#' @param x [ivl_df]
#' @param genome [genome_df]
#' @param zero_depth If TRUE, report intervals with zero depth. Zero depth intervals will
#' be reported with respect to groups.
#'
#' @template groups
#'
#' @return
#' [ivl_df] with the an additional column:
#'
#' - `.depth` depth of interval coverage
#'
#' @family single set operations
#'
#' @seealso
#' \url{https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html}
#'
#' @examples
#' x <- tibble::tribble(
#' ~chrom, ~start, ~end, ~strand,
#' "chr1", 20, 70, "+",
#' "chr1", 50, 100, "-",
#' "chr1", 200, 250, "+",
#' "chr1", 220, 250, "+"
#' )
#'
#' genome <- tibble::tribble(
#' ~chrom, ~size,
#' "chr1", 500,
#' "chr2", 1000
#' )
#'
#' bed_genomecov(x, genome)
#'
#' bed_genomecov(dplyr::group_by(x, strand), genome)
#'
#' bed_genomecov(dplyr::group_by(x, strand), genome, zero_depth = TRUE)
#'
#' @export
bed_genomecov <- function(x, genome, zero_depth = FALSE) {
check_required(x)
check_required(genome)

x <- check_interval(x)
genome <- check_genome(genome)

non_genome_chroms <- setdiff(unique(x$chrom), genome$chrom)
if (length(non_genome_chroms) > 0) {
cli::cli_warn(c(
paste0(
"The following chromosomes in bed intervals are not",
" in the genome and will be ignored:"
),
paste0(
non_genome_chroms,
sep = "\n"
)
))
x <- x[!x[["chrom"]] %in% non_genome_chroms, ]
}

grp_cols <- group_vars(x)
x <- bed_sort(x)

groups <- rlang::syms(unique(c("chrom", grp_cols)))
x <- group_by(x, !!!groups)

max_coords <- group_chrom_sizes(x, genome)

res <- gcoverage_impl(x, max_coords)
res <- tibble::as_tibble(res)

# drop non-grouped cols as values no longer match ivls
res <- select(res, chrom, start, end, one_of(grp_cols), .depth)

if (!zero_depth) {
res <- res[res[[".depth"]] > 0, ]
} else {
# handle any missing chromosome, zero depth intervals handled on cpp side
missing_chroms <- setdiff(genome$chrom, group_data(x)$chrom)
if (length(missing_chroms) > 0) {
missing_chrom_ivls <- genome[genome[["chrom"]] %in% missing_chroms, ]
missing_chrom_ivls[["start"]] <- 0L
missing_chrom_ivls[[".depth"]] <- 0L
missing_chrom_ivls <- select(missing_chrom_ivls, chrom, start, end = size, .depth)

if (length(groups) > 1) {
missing_chrom_ivls <- fill_missing_grouping(missing_chrom_ivls, x)
}
res <- bind_rows(res, missing_chrom_ivls)
res <- bed_sort(res)
}
}

res
}

# return chrom sizes for each group
group_chrom_sizes <- function(x, genome) {
xx <- left_join(group_data(x), genome, by = "chrom")
xx[["size"]]
}

# expand df to contain non-chrom groups from grp_df
fill_missing_grouping <- function(df, grp_df) {
grps <- get_group_data(grp_df)
grps <- grps[, setdiff(colnames(grps), "chrom")]

# expand df rows for new groups
df_grown <- df[rep(seq_len(nrow(df)), nrow(grps)), ]
# expand groups df for new groups
grp_grown <- grps[rep(seq_len(nrow(grps)), nrow(df)), ]

stopifnot(nrow(df_grown) == nrow(grp_grown))
res <- bind_cols(df_grown, grp_grown)
res
}
2 changes: 1 addition & 1 deletion R/globals.r
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ globalVariables(c(
"p.value", ".win_size", ".row_id",
"cds_start", "cds_end", "score",
"n_int", "sum_overlap", "sum_x", "sum_xy", "sum_y",
"temp_start", "temp_end", "seqnames"
"temp_start", "temp_end", "seqnames", ".depth"
))
3 changes: 3 additions & 0 deletions bench/benchmarks.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ res <- mark(
bed_partition(x),
bed_cluster(x),
bed_complement(x, genome),
bed_genomecov(x, genome),
# multi tbl functions
bed_closest(x, y),
bed_intersect(x, y),
Expand Down Expand Up @@ -141,6 +142,7 @@ res2 <- mark(
bed_subtract(x, y),
bed_makewindows(x, win_size = 100),
bed_sort(z),
bed_genomecov(x, genome),
# GRanges
flank(x_gr, 1000),
shift(x_gr,0),
Expand All @@ -151,6 +153,7 @@ res2 <- mark(
setdiff(x_gr, y_gr),
tile(x_gr, width = 100),
sort(z_gr),
coverage(x_gr),
min_time = Inf,
iterations = nrep,
check = FALSE,
Expand Down
1 change: 1 addition & 0 deletions man/bed_cluster.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_complement.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_flank.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

68 changes: 68 additions & 0 deletions man/bed_genomecov.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_merge.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_partition.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_shift.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/bed_slop.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkgdown/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ reference:
- bed_cluster
- bed_complement
- bed_flank
- bed_genomecov
- bed_merge
- bed_partition
- bed_slop
Expand Down
12 changes: 12 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,18 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// gcoverage_impl
DataFrame gcoverage_impl(const ValrGroupedDataFrame& gdf, const IntegerVector& max_coords);
RcppExport SEXP _valr_gcoverage_impl(SEXP gdfSEXP, SEXP max_coordsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const ValrGroupedDataFrame& >::type gdf(gdfSEXP);
Rcpp::traits::input_parameter< const IntegerVector& >::type max_coords(max_coordsSEXP);
rcpp_result_gen = Rcpp::wrap(gcoverage_impl(gdf, max_coords));
return rcpp_result_gen;
END_RCPP
}
// intersect_impl
DataFrame intersect_impl(ValrGroupedDataFrame x, ValrGroupedDataFrame y, IntegerVector x_grp_indexes, IntegerVector y_grp_indexes, bool invert, const std::string& suffix_x, const std::string& suffix_y);
RcppExport SEXP _valr_intersect_impl(SEXP xSEXP, SEXP ySEXP, SEXP x_grp_indexesSEXP, SEXP y_grp_indexesSEXP, SEXP invertSEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP) {
Expand Down
Loading

0 comments on commit a4093ef

Please sign in to comment.