Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vectorizing reduce and gap methods #36

Merged
merged 10 commits into from
Jun 21, 2024
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Changelog

## Version 0.2.8
## Version 0.2.10

- Added a numpy vectorized version of finding gaps (tldr: not fast compared to the traditional version). May be needs a better implementation
- Added NCLS based intersection operation (based on what pyranges does in their internals)
- Added tests for intersection operations.

## Version 0.2.8 - 0.2.9

Optimizing a couple of methods in `IRanges`:

Expand Down
127 changes: 115 additions & 12 deletions src/iranges/IRanges.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,9 @@ def __init__(
self._validate_metadata()

def _sanitize_start(self, start):
if isinstance(start, np.ndarray) and start.dtype == np.int32:
return start

return np.asarray(start, dtype=np.int32)

def _sanitize_width(self, width):
if isinstance(width, np.ndarray) and width.dtype == np.int32:
return width

return np.asarray(width, dtype=np.int32)

def _validate_width(self):
Expand Down Expand Up @@ -1044,6 +1038,77 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang

return IRanges(_gapstarts, _gapends)

def gaps_numpy(
self, start: Optional[int] = None, end: Optional[int] = None
) -> "IRanges":
"""Gaps returns an ``IRanges`` object representing the set of integers that remain after the intervals are
removed specified by the start and end arguments.

This function uses a vectorized approach using numpy vectors.
The normal :py:meth:`~.gaps` method performs better in most cases.

Args:
start:
Restrict start position. Defaults to 1.

end:
Restrict end position. Defaults to None.

Returns:
A new ``IRanges`` is with the gap regions.
"""
mask = self._width > 0
starts = self._start[mask]
widths = self._width[mask]

order = np.argsort(starts)
starts = starts[order]
widths = widths[order]
ends = starts + widths

gaps = np.r_[starts[1:] - ends[:-1], np.inf]
merge_mask = np.r_[True, gaps <= 0][:-1]
merge_groups = np.cumsum(~merge_mask)
unique_groups = np.unique(merge_groups)

result_starts = []
result_ends = []

first = merge_groups == unique_groups[0]
current_start = starts[first].min()
current_end = ends[first].max()

if start is not None and start < current_start:
result_starts.append(start)
result_ends.append(current_start)

for group in unique_groups[1:]:
group_mask = merge_groups == group
group_starts = starts[group_mask]
group_ends = ends[group_mask]

_start = group_starts.min()
_end = group_ends.max()

if _start - current_end > 0:
result_starts.append(current_end)
result_ends.append(_start)

current_start = _start
current_end = _end
else:
current_end = _end

if end is not None and end > current_end:
result_starts.append(current_end)
result_ends.append(end + 1)

result_starts = np.array(result_starts)
result_ends = np.array(result_ends)

result_widths = result_ends - result_starts
return IRanges(result_starts, result_widths)

# folows the same logic as in https://stackoverflow.com/questions/55480499/split-set-of-intervals-into-minimal-set-of-disjoint-intervals
# otherwise too much magic happening here - https://github.com/Bioconductor/IRanges/blob/5acb46b3f2805f7f74fe4cb746780e75f8257a83/R/inter-range-methods.R#L389
def disjoin(self, with_reverse_map: bool = False) -> "IRanges":
Expand Down Expand Up @@ -1640,6 +1705,48 @@ def intersect(self, other: "IRanges") -> "IRanges":

return _inter

# Inspired by pyranges intersection using NCLS
# https://github.com/pyranges/pyranges/blob/master/pyranges/methods/intersection.py
def intersect_ncls(self, other: "IRanges", delete_index: bool = True) -> "IRanges":
"""Find intersecting intervals with `other`. Uses the NCLS index.

Args:
other:
An `IRanges` object.

Raises:
TypeError:
If ``other`` is not `IRanges`.

Returns:
A new ``IRanges`` object with all intersecting intervals.
"""

other._build_ncls_index()

self_indexes, other_indexes = other._ncls.all_overlaps_both(
self.start, self.end, np.arange(len(self))
)

if delete_index:
other._delete_ncls_index()

self_new_starts = self.start[self_indexes]
other_new_starts = other.start[other_indexes]

new_starts = np.where(
self_new_starts > other_new_starts, self_new_starts, other_new_starts
)

self_new_ends = self.end[self_indexes]
other_new_ends = other.end[other_indexes]

new_ends = np.where(
self_new_ends < other_new_ends, self_new_ends, other_new_ends
)

return IRanges(new_starts, new_ends - new_starts).reduce()

############################
#### Overlap operations ####
############################
Expand All @@ -1651,9 +1758,7 @@ def _build_ncls_index(self):
from ncls import NCLS

if not hasattr(self, "_ncls"):
self._ncls = NCLS(
self.start, self.end, np.asarray([i for i in range(len(self))])
)
self._ncls = NCLS(self.start, self.end, np.arange(len(self)))

def _delete_ncls_index(self):
if hasattr(self, "_ncls"):
Expand All @@ -1673,9 +1778,7 @@ def _generic_find_hits(

new_starts = query._start - gap_start - 1
new_ends = query.end + gap_end + 1
_res = self._ncls.all_overlaps_both(
new_starts, new_ends, np.asarray(range(len(query)))
)
_res = self._ncls.all_overlaps_both(new_starts, new_ends, np.arange(len(query)))
all_overlaps = [[] for _ in range(len(query))]

for i in range(len(_res[0])):
Expand Down
17 changes: 17 additions & 0 deletions tests/test_set_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,20 @@ def test_intersect():
res = x.intersect(y)
assert all(np.equal(res.start, [-2, 6, 14]))
assert all(np.equal(res.width, [5, 3, 4]))

res = y.intersect(x)
assert all(np.equal(res.start, [-2, 6, 14]))
assert all(np.equal(res.width, [5, 3, 4]))


def test_intersect_ncls():
x = IRanges([1, 5, -2, 0, 14], [10, 5, 6, 12, 4])
y = IRanges([14, 0, -5, 6, 18], [7, 3, 8, 3, 3])

res = x.intersect_ncls(y)
assert all(np.equal(res.start, [-2, 6, 14]))
assert all(np.equal(res.width, [5, 3, 4]))

res = y.intersect_ncls(x)
assert all(np.equal(res.start, [-2, 6, 14]))
assert all(np.equal(res.width, [5, 3, 4]))
Loading