From a6e4dc61c489c4de6942f1fb8fecce9182eae337 Mon Sep 17 00:00:00 2001 From: Kent Riemondy Date: Thu, 4 Aug 2022 14:01:03 -0600 Subject: [PATCH] ensure that first interval per group is part of cluster in bed_cluster. --- NEWS.md | 2 ++ src/merge.cpp | 16 +++++++++------- tests/testthat/test_cluster.r | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index e257ec20..3d75d991 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # valr (development version) +* Fixed bug in handling the max_dist argument for first intervals in a contig with `bed_cluster()` (#388) + # valr 0.6.4 * Fixed intron score numbering error in `create_introns` (#377 @sheridar) diff --git a/src/merge.cpp b/src/merge.cpp index 49d35438..6a1d0d92 100644 --- a/src/merge.cpp +++ b/src/merge.cpp @@ -80,19 +80,21 @@ DataFrame clusterMergedIntervals(const ValrGroupedDataFrame& gdf, ListView idx(gdf.indices()) ; // approach from http://www.geeksforgeeks.org/merging-intervals/ - - std::vector s ; - for (int i = 0; i < ng; i++) { IntegerVector indices ; indices = idx[i]; + int ni = indices.size(); ivl_vector_t intervals = makeIntervalVector(df, indices); - // set dummy first interval to ensure first interval maintained + + // store a first interval to ensure first interval maintained ivl_t last_interval = ivl_t(0, 0, 0) ; + std::vector s ; s.push_back(last_interval) ; - for (auto it : intervals) { + + for (int j = 0; j < ni ; j++) { + ivl_t it = intervals[j]; // get index to store cluster ids in vector auto idx = it.value ; @@ -102,7 +104,7 @@ DataFrame clusterMergedIntervals(const ValrGroupedDataFrame& gdf, last_interval = it ; // set interval to compare auto top = s.back() ; // last good interval - if (top.stop + max_dist < it.start) { + if ( j == 0 || top.stop + max_dist < it.start) { // no overlap push to end of vector and get new id s.push_back(it) ; cluster_id++ ; @@ -115,7 +117,7 @@ DataFrame clusterMergedIntervals(const ValrGroupedDataFrame& gdf, top.stop = it.stop ; // update end position s.pop_back() ; // remove best end s.push_back(top) ; // update end interval - ids[idx] = cluster_id ; // assign cluster id at proper inde + ids[idx] = cluster_id ; // assign cluster id at proper index } else { diff --git a/tests/testthat/test_cluster.r b/tests/testthat/test_cluster.r index 984734ea..295f4f82 100644 --- a/tests/testthat/test_cluster.r +++ b/tests/testthat/test_cluster.r @@ -52,3 +52,26 @@ test_that("cluster ids are not repeated per group issue #171", { shared_ids <- intersect(chr1_ids, chr2_ids) expect_equal(length(shared_ids), 0) }) + + +test_that("guard against max_dist argument preventing clustering first interval in contig issue #388", { + x <- tibble::tribble( + ~chrom, ~start, ~end, + "a", 1, 10, + "a", 20, 50, + "b", 20, 50, + "c", 100, 100 + ) + + res <- bed_cluster(x, max_dist=0) + expect_equal(res$.id, 1L:4L) + + res <- bed_cluster(x, max_dist=100) + expect_equal(res$.id, c(1, 1, 2, 3)) + + res <- bed_cluster(x, max_dist=10) + expect_equal(res$.id, c(1, 1, 2, 3)) + + res <- bed_cluster(x, max_dist=9) + expect_equal(res$.id, 1L:4L) +})