From 6fe159d8dd2460785b2383f3d2e558d825fc7a58 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 20 Jun 2024 22:12:19 -0700 Subject: [PATCH 1/9] vectorizing reduce and gap methods --- src/iranges/IRanges.py | 112 ++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 46 deletions(-) diff --git a/src/iranges/IRanges.py b/src/iranges/IRanges.py index 1ce3383..29f123c 100644 --- a/src/iranges/IRanges.py +++ b/src/iranges/IRanges.py @@ -115,15 +115,9 @@ def __init__( self._validate_metadata() def _sanitize_start(self, start): - if isinstance(start, np.ndarray) and start.dtype == np.int32: - return start - return np.asarray(start, dtype=np.int32) def _sanitize_width(self, width): - if isinstance(width, np.ndarray) and width.dtype == np.int32: - return width - return np.asarray(width, dtype=np.int32) def _validate_width(self): @@ -1001,48 +995,70 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang Returns: A new ``IRanges`` is with the gap regions. """ - out_ranges = [] - order_buf = self.order() + order = np.argsort(self._start) + starts = self._start[order] + widths = self._width[order] + ends = starts + widths + + gaps = np.r_[starts[1:] - ends[:-1], np.inf] + merge_mask = np.r_[True, gaps <= 0][:-1] + merge_groups = np.cumsum(~merge_mask) + unique_groups = np.unique(merge_groups) + + result_starts = [] + result_ends = [] + + for group in unique_groups: + group_mask = merge_groups == group + group_starts = starts[group_mask] + group_ends = ends[group_mask] + + _start = group_starts.min() + _end = group_ends.max() + + result_starts.append(_start) + result_ends.append(_end) + + result_starts = np.array(result_starts) + result_ends = np.array(result_ends) - 1 + + if len(result_starts) == 0: + if start is not None and end is not None: + return IRanges([start], [end - start + 1]) + return IRanges([], []) if start is not None: - max_end = start - 1 - else: - max_end = float("inf") + result_starts = np.maximum(result_starts, start) + if end is not None: + result_ends = np.minimum(result_ends, end) - for i in order_buf: - width_j = self._width[i] - if width_j == 0: - continue - start_j = self._start[i] - end_j = start_j + width_j - 1 + mask = result_starts <= result_ends + result_starts = result_starts[mask] + result_ends = result_ends[mask] - if max_end == float("inf"): - max_end = end_j - else: - gapstart = max_end + 1 - if end is not None and start_j > end + 1: - start_j = end + 1 - gapwidth = start_j - gapstart - if gapwidth >= 1: - out_ranges.append((gapstart, gapwidth)) - max_end = end_j - elif end_j > max_end: - max_end = end_j - - if end is not None and max_end >= end: - break - - if end is not None and max_end is not None and max_end < end: - gapstart = max_end + 1 - gapwidth = end - max_end - out_ranges.append((gapstart, gapwidth)) - - _gapstarts = [] - _gapends = [] - if len(out_ranges): - _gapstarts, _gapends = zip(*out_ranges) - - return IRanges(_gapstarts, _gapends) + if len(result_starts) == 0: + if start is not None and end is not None: + return IRanges([start], [end - start + 1]) + return IRanges([], []) + + gap_starts = result_ends[:-1] + 1 + gap_ends = result_starts[1:] + + # Add gaps at the beginning and end if necessary + if start is not None and start < result_starts[0]: + gap_starts = np.insert(gap_starts, 0, start) + gap_ends = np.insert(gap_ends, 0, result_starts[0]) + if end is not None and end > result_ends[-1]: + gap_starts = np.append(gap_starts, result_ends[-1] + 1) + gap_ends = np.append(gap_ends, end + 1) + + # Remove invalid gaps + valid_gaps = gap_ends > gap_starts + gap_starts = gap_starts[valid_gaps] + gap_ends = gap_ends[valid_gaps] + + gap_widths = gap_ends - gap_starts + return IRanges(gap_starts, gap_widths) # folows the same logic as in https://stackoverflow.com/questions/55480499/split-set-of-intervals-into-minimal-set-of-disjoint-intervals # otherwise too much magic happening here - https://github.com/Bioconductor/IRanges/blob/5acb46b3f2805f7f74fe4cb746780e75f8257a83/R/inter-range-methods.R#L389 @@ -1635,8 +1651,12 @@ def intersect(self, other: "IRanges") -> "IRanges": start = min(self.start.min(), other.start.min()) end = max(self.end.max(), other.end.max()) - _gaps = other.gaps(start=start, end=end) - _inter = self.setdiff(_gaps) + if len(other) > len(self): + _gaps = other.gaps(start=start, end=end) + _inter = self.setdiff(_gaps) + else: + _gaps = self.gaps(start=start, end=end) + _inter = other.setdiff(_gaps) return _inter From 020d6150e20d1567dda0c0b0a02018fbe890d81e Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 20 Jun 2024 22:57:45 -0700 Subject: [PATCH 2/9] may be better --- src/iranges/IRanges.py | 77 ++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 44 deletions(-) diff --git a/src/iranges/IRanges.py b/src/iranges/IRanges.py index 29f123c..a836726 100644 --- a/src/iranges/IRanges.py +++ b/src/iranges/IRanges.py @@ -995,9 +995,13 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang Returns: A new ``IRanges`` is with the gap regions. """ - order = np.argsort(self._start) - starts = self._start[order] - widths = self._width[order] + mask = self._width > 0 + starts = self._start[mask] + widths = self._width[mask] + + order = np.argsort(starts) + starts = starts[order] + widths = widths[order] ends = starts + widths gaps = np.r_[starts[1:] - ends[:-1], np.inf] @@ -1008,7 +1012,16 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang result_starts = [] result_ends = [] - for group in unique_groups: + first = merge_groups == unique_groups[0] + current_start = starts[first].min() + current_end = ends[first].max() + + if start is not None and start < current_start: + print("start", start) + result_starts.append(start) + result_ends.append(current_start) + + for group in unique_groups[1:]: group_mask = merge_groups == group group_starts = starts[group_mask] group_ends = ends[group_mask] @@ -1016,49 +1029,25 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang _start = group_starts.min() _end = group_ends.max() - result_starts.append(_start) - result_ends.append(_end) + if _start - current_end > 0: + result_starts.append(current_end) + result_ends.append(_start) - result_starts = np.array(result_starts) - result_ends = np.array(result_ends) - 1 + current_start = _start + current_end = _end + else: + current_end = _end - if len(result_starts) == 0: - if start is not None and end is not None: - return IRanges([start], [end - start + 1]) - return IRanges([], []) + if end is not None and end > current_end: + print("end", end) + result_starts.append(current_end) + result_ends.append(end + 1) - if start is not None: - result_starts = np.maximum(result_starts, start) - if end is not None: - result_ends = np.minimum(result_ends, end) - - mask = result_starts <= result_ends - result_starts = result_starts[mask] - result_ends = result_ends[mask] - - if len(result_starts) == 0: - if start is not None and end is not None: - return IRanges([start], [end - start + 1]) - return IRanges([], []) - - gap_starts = result_ends[:-1] + 1 - gap_ends = result_starts[1:] - - # Add gaps at the beginning and end if necessary - if start is not None and start < result_starts[0]: - gap_starts = np.insert(gap_starts, 0, start) - gap_ends = np.insert(gap_ends, 0, result_starts[0]) - if end is not None and end > result_ends[-1]: - gap_starts = np.append(gap_starts, result_ends[-1] + 1) - gap_ends = np.append(gap_ends, end + 1) - - # Remove invalid gaps - valid_gaps = gap_ends > gap_starts - gap_starts = gap_starts[valid_gaps] - gap_ends = gap_ends[valid_gaps] - - gap_widths = gap_ends - gap_starts - return IRanges(gap_starts, gap_widths) + result_starts = np.array(result_starts) + result_ends = np.array(result_ends) + + result_widths = result_ends - result_starts + return IRanges(result_starts, result_widths) # folows the same logic as in https://stackoverflow.com/questions/55480499/split-set-of-intervals-into-minimal-set-of-disjoint-intervals # otherwise too much magic happening here - https://github.com/Bioconductor/IRanges/blob/5acb46b3f2805f7f74fe4cb746780e75f8257a83/R/inter-range-methods.R#L389 From 034ee99ab67548b1446571867d5955cdc8bb5979 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 21 Jun 2024 07:46:56 -0700 Subject: [PATCH 3/9] add intersect using ncls --- src/iranges/IRanges.py | 48 ++++++++++++++++++++++++++++++------------ tests/test_set_ops.py | 9 ++++++++ 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/src/iranges/IRanges.py b/src/iranges/IRanges.py index a836726..8ba0f7a 100644 --- a/src/iranges/IRanges.py +++ b/src/iranges/IRanges.py @@ -1017,7 +1017,6 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang current_end = ends[first].max() if start is not None and start < current_start: - print("start", start) result_starts.append(start) result_ends.append(current_start) @@ -1039,7 +1038,6 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang current_end = _end if end is not None and end > current_end: - print("end", end) result_starts.append(current_end) result_ends.append(end + 1) @@ -1640,15 +1638,41 @@ def intersect(self, other: "IRanges") -> "IRanges": start = min(self.start.min(), other.start.min()) end = max(self.end.max(), other.end.max()) - if len(other) > len(self): - _gaps = other.gaps(start=start, end=end) - _inter = self.setdiff(_gaps) - else: - _gaps = self.gaps(start=start, end=end) - _inter = other.setdiff(_gaps) + _gaps = other.gaps(start=start, end=end) + _inter = self.setdiff(_gaps) return _inter + # Inspired by pyranges intersection using NCLS + # https://github.com/pyranges/pyranges/blob/master/pyranges/methods/intersection.py + def intersect_ncls(self, other: "IRanges", delete_index: bool = True) -> "IRanges": + # self._build_ncls_index() + + other._build_ncls_index() + + self_indexes, other_indexes = other._ncls.all_overlaps_both( + self.start, self.end, np.arange(len(self)) + ) + + if delete_index: + other._delete_ncls_index() + + self_new_starts = self.start[self_indexes] + other_new_starts = other.start[other_indexes] + + new_starts = np.where( + self_new_starts > other_new_starts, self_new_starts, other_new_starts + ) + + self_new_ends = self.end[self_indexes] + other_new_ends = other.end[other_indexes] + + new_ends = np.where( + self_new_ends < other_new_ends, self_new_ends, other_new_ends + ) + + return IRanges(new_starts, new_ends - new_starts).reduce() + ############################ #### Overlap operations #### ############################ @@ -1660,9 +1684,7 @@ def _build_ncls_index(self): from ncls import NCLS if not hasattr(self, "_ncls"): - self._ncls = NCLS( - self.start, self.end, np.asarray([i for i in range(len(self))]) - ) + self._ncls = NCLS(self.start, self.end, np.arange(len(self))) def _delete_ncls_index(self): if hasattr(self, "_ncls"): @@ -1682,9 +1704,7 @@ def _generic_find_hits( new_starts = query._start - gap_start - 1 new_ends = query.end + gap_end + 1 - _res = self._ncls.all_overlaps_both( - new_starts, new_ends, np.asarray(range(len(query))) - ) + _res = self._ncls.all_overlaps_both(new_starts, new_ends, np.arange(len(query))) all_overlaps = [[] for _ in range(len(query))] for i in range(len(_res[0])): diff --git a/tests/test_set_ops.py b/tests/test_set_ops.py index 21d90cd..121bd1a 100644 --- a/tests/test_set_ops.py +++ b/tests/test_set_ops.py @@ -32,3 +32,12 @@ def test_intersect(): res = x.intersect(y) assert all(np.equal(res.start, [-2, 6, 14])) assert all(np.equal(res.width, [5, 3, 4])) + +def test_intersect_ncls(): + x = IRanges([1, 5, -2, 0, 14], [10, 5, 6, 12, 4]) + y = IRanges([14, 0, -5, 6, 18], [7, 3, 8, 3, 3]) + + res = x.intersect_ncls(y) + print(res) + assert all(np.equal(res.start, [-2, 6, 14])) + assert all(np.equal(res.width, [5, 3, 4])) \ No newline at end of file From 5324f3d25c8a4ea4d049d163dfdbfe2e5b2d9419 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Jun 2024 14:47:05 +0000 Subject: [PATCH 4/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_set_ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_set_ops.py b/tests/test_set_ops.py index 121bd1a..2807069 100644 --- a/tests/test_set_ops.py +++ b/tests/test_set_ops.py @@ -33,6 +33,7 @@ def test_intersect(): assert all(np.equal(res.start, [-2, 6, 14])) assert all(np.equal(res.width, [5, 3, 4])) + def test_intersect_ncls(): x = IRanges([1, 5, -2, 0, 14], [10, 5, 6, 12, 4]) y = IRanges([14, 0, -5, 6, 18], [7, 3, 8, 3, 3]) @@ -40,4 +41,4 @@ def test_intersect_ncls(): res = x.intersect_ncls(y) print(res) assert all(np.equal(res.start, [-2, 6, 14])) - assert all(np.equal(res.width, [5, 3, 4])) \ No newline at end of file + assert all(np.equal(res.width, [5, 3, 4])) From 9c51cb9e471c12f02f8499621c49dfe4f70821f9 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 21 Jun 2024 08:05:57 -0700 Subject: [PATCH 5/9] fix tests --- src/iranges/IRanges.py | 59 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/iranges/IRanges.py b/src/iranges/IRanges.py index 8ba0f7a..7ca2977 100644 --- a/src/iranges/IRanges.py +++ b/src/iranges/IRanges.py @@ -985,6 +985,65 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang """Gaps returns an ``IRanges`` object representing the set of integers that remain after the intervals are removed specified by the start and end arguments. + Args: + start: + Restrict start position. Defaults to 1. + + end: + Restrict end position. Defaults to None. + + Returns: + A new ``IRanges`` is with the gap regions. + """ + out_ranges = [] + order_buf = self.order() + + if start is not None: + max_end = start - 1 + else: + max_end = float("inf") + + for i in order_buf: + width_j = self._width[i] + if width_j == 0: + continue + start_j = self._start[i] + end_j = start_j + width_j - 1 + + if max_end == float("inf"): + max_end = end_j + else: + gapstart = max_end + 1 + if end is not None and start_j > end + 1: + start_j = end + 1 + gapwidth = start_j - gapstart + if gapwidth >= 1: + out_ranges.append((gapstart, gapwidth)) + max_end = end_j + elif end_j > max_end: + max_end = end_j + + if end is not None and max_end >= end: + break + + if end is not None and max_end is not None and max_end < end: + gapstart = max_end + 1 + gapwidth = end - max_end + out_ranges.append((gapstart, gapwidth)) + + _gapstarts = [] + _gapends = [] + if len(out_ranges): + _gapstarts, _gapends = zip(*out_ranges) + + return IRanges(_gapstarts, _gapends) + + def gaps_numpy( + self, start: Optional[int] = None, end: Optional[int] = None + ) -> "IRanges": + """Gaps returns an ``IRanges`` object representing the set of integers that remain after the intervals are + removed specified by the start and end arguments. + Args: start: Restrict start position. Defaults to 1. From c36237a7be3dbdaecc841a9474c0e465ecbdbd60 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 21 Jun 2024 11:20:09 -0700 Subject: [PATCH 6/9] add tests for intersection --- tests/test_set_ops.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_set_ops.py b/tests/test_set_ops.py index 2807069..f778a33 100644 --- a/tests/test_set_ops.py +++ b/tests/test_set_ops.py @@ -33,12 +33,18 @@ def test_intersect(): assert all(np.equal(res.start, [-2, 6, 14])) assert all(np.equal(res.width, [5, 3, 4])) + res = y.intersect(x) + assert all(np.equal(res.start, [-2, 6, 14])) + assert all(np.equal(res.width, [5, 3, 4])) def test_intersect_ncls(): x = IRanges([1, 5, -2, 0, 14], [10, 5, 6, 12, 4]) y = IRanges([14, 0, -5, 6, 18], [7, 3, 8, 3, 3]) res = x.intersect_ncls(y) - print(res) assert all(np.equal(res.start, [-2, 6, 14])) assert all(np.equal(res.width, [5, 3, 4])) + + res = y.intersect_ncls(x) + assert all(np.equal(res.start, [-2, 6, 14])) + assert all(np.equal(res.width, [5, 3, 4])) \ No newline at end of file From e27a3238bb713ce352e1086e365a0be7432713d1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Jun 2024 18:20:18 +0000 Subject: [PATCH 7/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_set_ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_set_ops.py b/tests/test_set_ops.py index f778a33..17f0d19 100644 --- a/tests/test_set_ops.py +++ b/tests/test_set_ops.py @@ -37,6 +37,7 @@ def test_intersect(): assert all(np.equal(res.start, [-2, 6, 14])) assert all(np.equal(res.width, [5, 3, 4])) + def test_intersect_ncls(): x = IRanges([1, 5, -2, 0, 14], [10, 5, 6, 12, 4]) y = IRanges([14, 0, -5, 6, 18], [7, 3, 8, 3, 3]) @@ -47,4 +48,4 @@ def test_intersect_ncls(): res = y.intersect_ncls(x) assert all(np.equal(res.start, [-2, 6, 14])) - assert all(np.equal(res.width, [5, 3, 4])) \ No newline at end of file + assert all(np.equal(res.width, [5, 3, 4])) From f7eba2da25f91475125eaf59cfedaf2dcc9240a8 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Fri, 21 Jun 2024 11:28:55 -0700 Subject: [PATCH 8/9] update docstrings and changelog --- CHANGELOG.md | 8 +++++++- src/iranges/IRanges.py | 17 ++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db4a39c..82620f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,12 @@ # Changelog -## Version 0.2.8 +## Version 0.2.10 + +- Added a numpy vectorized version of finding gaps (tldr: not fast compared to the traditional version). May be needs a better implementation +- Added NCLS based intersection operation (based on what pyranges does in their internals) +- Added tests for intersection operations. + +## Version 0.2.8 - 0.2.9 Optimizing a couple of methods in `IRanges`: diff --git a/src/iranges/IRanges.py b/src/iranges/IRanges.py index 7ca2977..36b2770 100644 --- a/src/iranges/IRanges.py +++ b/src/iranges/IRanges.py @@ -1044,6 +1044,9 @@ def gaps_numpy( """Gaps returns an ``IRanges`` object representing the set of integers that remain after the intervals are removed specified by the start and end arguments. + This function uses a vectorized approach using numpy vectors. + The normal :py:meth:`~.gaps` method performs better in most cases. + Args: start: Restrict start position. Defaults to 1. @@ -1705,7 +1708,19 @@ def intersect(self, other: "IRanges") -> "IRanges": # Inspired by pyranges intersection using NCLS # https://github.com/pyranges/pyranges/blob/master/pyranges/methods/intersection.py def intersect_ncls(self, other: "IRanges", delete_index: bool = True) -> "IRanges": - # self._build_ncls_index() + """Find intersecting intervals with `other`. Uses the NCLS index. + + Args: + other: + An `IRanges` object. + + Raises: + TypeError: + If ``other`` is not `IRanges`. + + Returns: + A new ``IRanges`` object with all intersecting intervals. + """ other._build_ncls_index() From 286b5769a0dc7e9ee75dbc97cddf8b14fca79686 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Jun 2024 18:29:44 +0000 Subject: [PATCH 9/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82620f0..95a7c1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ - Added a numpy vectorized version of finding gaps (tldr: not fast compared to the traditional version). May be needs a better implementation - Added NCLS based intersection operation (based on what pyranges does in their internals) -- Added tests for intersection operations. +- Added tests for intersection operations. ## Version 0.2.8 - 0.2.9