From 94916801cf9fdee83f8c01859756b6543d6844ee Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 19 Jun 2024 19:50:25 -0700 Subject: [PATCH] Optimize `gaps` and `reduce` methods (#34) Optimizing a couple of methods in `IRanges`: - Update `gaps` and `reduce` to slightly faster NumPy based operations. - Switch `np.array` to `np.asarray` --- CHANGELOG.md | 9 +- src/iranges/IRanges.py | 188 ++++++++++++++++++---------------------- src/iranges/interval.py | 22 +++-- 3 files changed, 102 insertions(+), 117 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef2c26b..db4a39c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## Version 0.2.8 + +Optimizing a couple of methods in `IRanges`: + +- Update `gaps` and `reduce` to slightly faster NumPy based operations. +- Switch `np.array` to `np.asarray` + ## Version 0.2.7 Changes to be compatible with NumPy's 2.0 release: @@ -9,7 +16,7 @@ Changes to be compatible with NumPy's 2.0 release: ## Version 0.2.4 -- Support coersion from `IRanges` to Polars and vice-versa +- Support coercion from `IRanges` to Polars and vice-versa - Setting up `myst_nb` to execute snippets in tutorial/documentation markdown files ## Version 0.2.3 diff --git a/src/iranges/IRanges.py b/src/iranges/IRanges.py index dd36b12..be52f29 100644 --- a/src/iranges/IRanges.py +++ b/src/iranges/IRanges.py @@ -739,7 +739,7 @@ def _sanitize_vec_argument( if isinstance(vec, int): return vec elif ut.is_list_of_type(vec, int, ignore_none=allow_none): - vec = np.array(vec) + vec = np.asarray(vec) if len(vec) < _size: raise ValueError("Provided argument must match the number of intervals.") @@ -852,7 +852,7 @@ def coverage( raise TypeError("'width' must be an integer or float.") if isinstance(width, (np.ndarray, list)): - width = max(width) + width = width.max() cov, _ = create_np_interval_vector(new_ranges, force_size=width, value=weight) return cov @@ -865,8 +865,8 @@ def range(self) -> "IRanges": the minimum of all the start positions, Maximum of all end positions. """ - min_start = min(self.start) - max_end = max(self.end) + min_start = self.start.min() + max_end = self.end.max() return IRanges([min_start], [max_end - min_start]) @@ -896,60 +896,43 @@ def reduce( if min_gap_width < 0: raise ValueError("'min_gap_width' cannot be negative.") - _order = self.order() + order = self.order() + starts = self._start[order] + widths = self._width[order] + ends = starts + widths - 1 + + if drop_empty_ranges: + valid_mask = widths > 0 + starts = starts[valid_mask] + ends = ends[valid_mask] + widths = widths[valid_mask] + order = np.array(order)[valid_mask] + + gaps = np.r_[starts[1:] - ends[:-1], np.inf] + merge_mask = np.r_[True, gaps <= min_gap_width][:-1] + merge_groups = np.cumsum(~merge_mask) + unique_groups = np.unique(merge_groups) result_starts = [] result_widths = [] result_revmaps = [] - counter = 0 - - def get_elem_counter(idx): - elem = self[idx] - start = elem.start[0] - end = elem.end[0] - 1 - width = elem.width[0] - return start, end, width - current_start, current_end, _ = get_elem_counter(_order[counter]) - current_revmaps = [_order[counter]] + for group in unique_groups: + group_mask = merge_groups == group + group_starts = starts[group_mask] + group_ends = ends[group_mask] + group_indices = order[group_mask] - counter += 1 - while counter < len(_order): - merge = False + start = group_starts.min() + end = group_ends.max() + width = end - start + 1 - o = _order[counter] - _idx_start, _idx_end, _idx_width = get_elem_counter(o) - _gap_width = _idx_start - current_end - - if _gap_width <= min_gap_width: - merge = True - - if merge is True: - if current_end < _idx_end: - current_end = _idx_end - - current_revmaps.append(o) - counter += 1 - else: - if not ( - drop_empty_ranges is True and current_end - current_start + 1 == 0 - ): - result_starts.append(current_start) - result_widths.append(current_end - current_start + 1) - result_revmaps.append(current_revmaps) - - current_revmaps = [o] - current_start = _idx_start - current_end = _idx_end - counter += 1 - - result_starts.append(current_start) - result_widths.append(current_end - current_start + 1) - result_revmaps.append(current_revmaps) + result_starts.append(start) + result_widths.append(width) + result_revmaps.append(group_indices.tolist()) result = IRanges(result_starts, result_widths) - - if with_reverse_map is True: + if with_reverse_map: result._mcols.set_column("revmap", result_revmaps, in_place=True) return result @@ -976,9 +959,14 @@ def order(self, decreasing: bool = False) -> np.ndarray: Returns: NumPy vector containing index positions in the sorted order. """ - intvals = sorted(self._get_intervals_as_list(), reverse=decreasing) - order = [o[2] for o in intvals] - return np.array(order) + order_buf = sorted( + range(len(self)), key=lambda i: (self._start[i], self._width[i]) + ) + + if decreasing: + return np.asarray(order_buf[::-1]) + + return np.asarray(order_buf) def sort(self, decreasing: bool = False, in_place: bool = False) -> "IRanges": """Sort the intervals. @@ -1013,52 +1001,44 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang Returns: A new ``IRanges`` is with the gap regions. """ - _order = self.order() - - overlap_start = min(self.start) if start is None else start - overlap_end = overlap_start - 1 if start is not None else None - - result_starts = [] - result_widths = [] + out_ranges = [] + order_buf = self.order() - def get_elem_counter(idx): - elem = self[idx] - start = elem.start[0] - end = elem.end[0] - 1 - width = elem.width[0] - return start, end, width - - for i in range(len(_order)): - _start, _end, _width = get_elem_counter(_order[i]) + if start is not None: + max_end = start - 1 + else: + max_end = float("inf") - if _width == 0: + for i in order_buf: + width_j = self._width[i] + if width_j == 0: continue + start_j = self._start[i] + end_j = start_j + width_j - 1 - if overlap_end is None: - overlap_end = _end + if max_end == float("inf"): + max_end = end_j else: - _gap_start = overlap_end + 1 - - if end is not None and _start > end + 1: - _start = end + 1 - - _gap_width = _start - _gap_start - - if _gap_width >= 1: - result_starts.append(_gap_start) - result_widths.append(_gap_width) - overlap_end = _end - elif _end > overlap_end: - overlap_end = _end - - if end is not None and overlap_end >= end: + gapstart = max_end + 1 + if end is not None and start_j > end + 1: + start_j = end + 1 + gapwidth = start_j - gapstart + if gapwidth >= 1: + out_ranges.append((gapstart, gapwidth)) + max_end = end_j + elif end_j > max_end: + max_end = end_j + + if end is not None and max_end >= end: break - if end is not None and overlap_end is not None and overlap_end < end: - result_starts.append(overlap_end + 1) - result_widths.append(end - overlap_end) + if end is not None and max_end is not None and max_end < end: + gapstart = max_end + 1 + gapwidth = end - max_end + out_ranges.append((gapstart, gapwidth)) - return IRanges(result_starts, result_widths) + _gapstarts, _gapends = zip(*out_ranges) + return IRanges(_gapstarts, _gapends) # folows the same logic as in https://stackoverflow.com/questions/55480499/split-set-of-intervals-into-minimal-set-of-disjoint-intervals # otherwise too much magic happening here - https://github.com/Bioconductor/IRanges/blob/5acb46b3f2805f7f74fe4cb746780e75f8257a83/R/inter-range-methods.R#L389 @@ -1254,8 +1234,8 @@ def narrow( counter += 1 - output._start = np.array(new_starts) - output._width = np.array(new_widths) + output._start = np.asarray(new_starts) + output._width = np.asarray(new_widths) return output def resize( @@ -1330,7 +1310,7 @@ def resize( counter += 1 output = self._define_output(in_place) - output._start = np.array(new_starts) + output._start = np.asarray(new_starts) output._width = ( np.repeat(_awidth, len(self)) if isinstance(_awidth, int) else _awidth ) @@ -1571,7 +1551,7 @@ def overlap_indices( counter += 1 - return np.array(overlaps) + return np.asarray(overlaps) ######################## #### set operations #### @@ -1621,8 +1601,8 @@ def setdiff(self, other: "IRanges") -> "IRanges": if not isinstance(other, IRanges): raise TypeError("'other' is not an `IRanges` object.") - start = min(min(self.start), min(other.start)) - end = max(max(self.end), max(other.end)) + start = min(self.start.min(), other.start.min()) + end = max(self.end.max(), other.end.max()) x_gaps = self.gaps(start=start, end=end) x_gaps_u = x_gaps.union(other) @@ -1648,8 +1628,8 @@ def intersect(self, other: "IRanges") -> "IRanges": if not isinstance(other, IRanges): raise TypeError("'other' is not an `IRanges` object.") - start = min(min(self.start), min(other.start)) - end = max(max(self.end), max(other.end)) + start = min(self.start.min(), other.start.min()) + end = max(self.end.max(), other.end.max()) _gaps = other.gaps(start=start, end=end) _inter = self.setdiff(_gaps) @@ -1668,7 +1648,7 @@ def _build_ncls_index(self): if not hasattr(self, "_ncls"): self._ncls = NCLS( - self.start, self.end, np.array([i for i in range(len(self))]) + self.start, self.end, np.asarray([i for i in range(len(self))]) ) def _delete_ncls_index(self): @@ -1690,7 +1670,7 @@ def _generic_find_hits( new_starts = query._start - gap_start - 1 new_ends = query.end + gap_end + 1 _res = self._ncls.all_overlaps_both( - new_starts, new_ends, np.array(range(len(query))) + new_starts, new_ends, np.asarray(range(len(query))) ) all_overlaps = [[] for _ in range(len(query))] @@ -1842,7 +1822,7 @@ def count_overlaps( min_overlap=min_overlap, delete_index=delete_index, ) - return np.array([len(x) for x in _overlaps]) + return np.asarray([len(x) for x in _overlaps]) def subset_by_overlaps( self, @@ -1909,8 +1889,8 @@ def _generic_search( select, delete_index=False, ): - min_start = min(self.start) - max_end = max(self.end) + min_start = self.start.min() + max_end = self.end.max() hits = [] for _, val in query: _iterate = True @@ -2112,7 +2092,7 @@ def distance(self, query: "IRanges") -> np.ndarray: all_distances.append(distance) - return np.array(all_distances) + return np.asarray(all_distances) ######################## #### pandas interop #### diff --git a/src/iranges/interval.py b/src/iranges/interval.py index 8922a87..b7294ee 100644 --- a/src/iranges/interval.py +++ b/src/iranges/interval.py @@ -41,7 +41,7 @@ def create_np_interval_vector( max_end = force_size if max_end is None: - max_end = max(intervals.get_end()) + max_end = intervals.get_end().max() else: max_end += 1 @@ -89,16 +89,14 @@ def calc_gap_and_overlap( Interval containing start and end positions. `end` is non-inclusive. """ - _gap = None - _overlap = None - - if first[0] < second[1] and first[1] > second[0]: + if min(first[1], second[1]) > max(first[0], second[0]): _overlap = min(first[1], second[1]) - max(first[0], second[0]) - else: - _gap = None - if second[0] >= first[1]: - _gap = second[0] - first[1] - elif first[0] >= second[1]: - _gap = first[0] - second[1] + return (None, _overlap) + + _gap = None + if second[0] >= first[1]: + _gap = second[0] - first[1] + elif first[0] >= second[1]: + _gap = first[0] - second[1] - return (_gap, _overlap) + return (_gap, None)