Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimizing a few operations #34

Merged
merged 5 commits into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## Version 0.2.8

Optimizing a couple of methods in `IRanges`:

- Update `gaps` and `reduce` to slightly faster NumPy based operations.
- Switch `np.array` to `np.asarray`

## Version 0.2.7

Changes to be compatible with NumPy's 2.0 release:
Expand All @@ -9,7 +16,7 @@ Changes to be compatible with NumPy's 2.0 release:

## Version 0.2.4

- Support coersion from `IRanges` to Polars and vice-versa
- Support coercion from `IRanges` to Polars and vice-versa
- Setting up `myst_nb` to execute snippets in tutorial/documentation markdown files

## Version 0.2.3
Expand Down
188 changes: 84 additions & 104 deletions src/iranges/IRanges.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ def _sanitize_vec_argument(
if isinstance(vec, int):
return vec
elif ut.is_list_of_type(vec, int, ignore_none=allow_none):
vec = np.array(vec)
vec = np.asarray(vec)

if len(vec) < _size:
raise ValueError("Provided argument must match the number of intervals.")
Expand Down Expand Up @@ -852,7 +852,7 @@ def coverage(
raise TypeError("'width' must be an integer or float.")

if isinstance(width, (np.ndarray, list)):
width = max(width)
width = width.max()

cov, _ = create_np_interval_vector(new_ranges, force_size=width, value=weight)
return cov
Expand All @@ -865,8 +865,8 @@ def range(self) -> "IRanges":
the minimum of all the start positions, Maximum of all end positions.
"""

min_start = min(self.start)
max_end = max(self.end)
min_start = self.start.min()
max_end = self.end.max()

return IRanges([min_start], [max_end - min_start])

Expand Down Expand Up @@ -896,60 +896,43 @@ def reduce(
if min_gap_width < 0:
raise ValueError("'min_gap_width' cannot be negative.")

_order = self.order()
order = self.order()
starts = self._start[order]
widths = self._width[order]
ends = starts + widths - 1

if drop_empty_ranges:
valid_mask = widths > 0
starts = starts[valid_mask]
ends = ends[valid_mask]
widths = widths[valid_mask]
order = np.array(order)[valid_mask]

gaps = np.r_[starts[1:] - ends[:-1], np.inf]
merge_mask = np.r_[True, gaps <= min_gap_width][:-1]
merge_groups = np.cumsum(~merge_mask)
unique_groups = np.unique(merge_groups)

result_starts = []
result_widths = []
result_revmaps = []
counter = 0

def get_elem_counter(idx):
elem = self[idx]
start = elem.start[0]
end = elem.end[0] - 1
width = elem.width[0]
return start, end, width

current_start, current_end, _ = get_elem_counter(_order[counter])
current_revmaps = [_order[counter]]
for group in unique_groups:
group_mask = merge_groups == group
group_starts = starts[group_mask]
group_ends = ends[group_mask]
group_indices = order[group_mask]

counter += 1
while counter < len(_order):
merge = False
start = group_starts.min()
end = group_ends.max()
width = end - start + 1

o = _order[counter]
_idx_start, _idx_end, _idx_width = get_elem_counter(o)
_gap_width = _idx_start - current_end

if _gap_width <= min_gap_width:
merge = True

if merge is True:
if current_end < _idx_end:
current_end = _idx_end

current_revmaps.append(o)
counter += 1
else:
if not (
drop_empty_ranges is True and current_end - current_start + 1 == 0
):
result_starts.append(current_start)
result_widths.append(current_end - current_start + 1)
result_revmaps.append(current_revmaps)

current_revmaps = [o]
current_start = _idx_start
current_end = _idx_end
counter += 1

result_starts.append(current_start)
result_widths.append(current_end - current_start + 1)
result_revmaps.append(current_revmaps)
result_starts.append(start)
result_widths.append(width)
result_revmaps.append(group_indices.tolist())

result = IRanges(result_starts, result_widths)

if with_reverse_map is True:
if with_reverse_map:
result._mcols.set_column("revmap", result_revmaps, in_place=True)

return result
Expand All @@ -976,9 +959,14 @@ def order(self, decreasing: bool = False) -> np.ndarray:
Returns:
NumPy vector containing index positions in the sorted order.
"""
intvals = sorted(self._get_intervals_as_list(), reverse=decreasing)
order = [o[2] for o in intvals]
return np.array(order)
order_buf = sorted(
range(len(self)), key=lambda i: (self._start[i], self._width[i])
)

if decreasing:
return np.asarray(order_buf[::-1])

return np.asarray(order_buf)

def sort(self, decreasing: bool = False, in_place: bool = False) -> "IRanges":
"""Sort the intervals.
Expand Down Expand Up @@ -1013,52 +1001,44 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang
Returns:
A new ``IRanges`` is with the gap regions.
"""
_order = self.order()

overlap_start = min(self.start) if start is None else start
overlap_end = overlap_start - 1 if start is not None else None

result_starts = []
result_widths = []
out_ranges = []
order_buf = self.order()

def get_elem_counter(idx):
elem = self[idx]
start = elem.start[0]
end = elem.end[0] - 1
width = elem.width[0]
return start, end, width

for i in range(len(_order)):
_start, _end, _width = get_elem_counter(_order[i])
if start is not None:
max_end = start - 1
else:
max_end = float("inf")

if _width == 0:
for i in order_buf:
width_j = self._width[i]
if width_j == 0:
continue
start_j = self._start[i]
end_j = start_j + width_j - 1

if overlap_end is None:
overlap_end = _end
if max_end == float("inf"):
max_end = end_j
else:
_gap_start = overlap_end + 1

if end is not None and _start > end + 1:
_start = end + 1

_gap_width = _start - _gap_start

if _gap_width >= 1:
result_starts.append(_gap_start)
result_widths.append(_gap_width)
overlap_end = _end
elif _end > overlap_end:
overlap_end = _end

if end is not None and overlap_end >= end:
gapstart = max_end + 1
if end is not None and start_j > end + 1:
start_j = end + 1
gapwidth = start_j - gapstart
if gapwidth >= 1:
out_ranges.append((gapstart, gapwidth))
max_end = end_j
elif end_j > max_end:
max_end = end_j

if end is not None and max_end >= end:
break

if end is not None and overlap_end is not None and overlap_end < end:
result_starts.append(overlap_end + 1)
result_widths.append(end - overlap_end)
if end is not None and max_end is not None and max_end < end:
gapstart = max_end + 1
gapwidth = end - max_end
out_ranges.append((gapstart, gapwidth))

return IRanges(result_starts, result_widths)
_gapstarts, _gapends = zip(*out_ranges)
return IRanges(_gapstarts, _gapends)

# folows the same logic as in https://stackoverflow.com/questions/55480499/split-set-of-intervals-into-minimal-set-of-disjoint-intervals
# otherwise too much magic happening here - https://github.com/Bioconductor/IRanges/blob/5acb46b3f2805f7f74fe4cb746780e75f8257a83/R/inter-range-methods.R#L389
Expand Down Expand Up @@ -1254,8 +1234,8 @@ def narrow(

counter += 1

output._start = np.array(new_starts)
output._width = np.array(new_widths)
output._start = np.asarray(new_starts)
output._width = np.asarray(new_widths)
return output

def resize(
Expand Down Expand Up @@ -1330,7 +1310,7 @@ def resize(
counter += 1

output = self._define_output(in_place)
output._start = np.array(new_starts)
output._start = np.asarray(new_starts)
output._width = (
np.repeat(_awidth, len(self)) if isinstance(_awidth, int) else _awidth
)
Expand Down Expand Up @@ -1571,7 +1551,7 @@ def overlap_indices(

counter += 1

return np.array(overlaps)
return np.asarray(overlaps)

########################
#### set operations ####
Expand Down Expand Up @@ -1621,8 +1601,8 @@ def setdiff(self, other: "IRanges") -> "IRanges":
if not isinstance(other, IRanges):
raise TypeError("'other' is not an `IRanges` object.")

start = min(min(self.start), min(other.start))
end = max(max(self.end), max(other.end))
start = min(self.start.min(), other.start.min())
end = max(self.end.max(), other.end.max())

x_gaps = self.gaps(start=start, end=end)
x_gaps_u = x_gaps.union(other)
Expand All @@ -1648,8 +1628,8 @@ def intersect(self, other: "IRanges") -> "IRanges":
if not isinstance(other, IRanges):
raise TypeError("'other' is not an `IRanges` object.")

start = min(min(self.start), min(other.start))
end = max(max(self.end), max(other.end))
start = min(self.start.min(), other.start.min())
end = max(self.end.max(), other.end.max())

_gaps = other.gaps(start=start, end=end)
_inter = self.setdiff(_gaps)
Expand All @@ -1668,7 +1648,7 @@ def _build_ncls_index(self):

if not hasattr(self, "_ncls"):
self._ncls = NCLS(
self.start, self.end, np.array([i for i in range(len(self))])
self.start, self.end, np.asarray([i for i in range(len(self))])
)

def _delete_ncls_index(self):
Expand All @@ -1690,7 +1670,7 @@ def _generic_find_hits(
new_starts = query._start - gap_start - 1
new_ends = query.end + gap_end + 1
_res = self._ncls.all_overlaps_both(
new_starts, new_ends, np.array(range(len(query)))
new_starts, new_ends, np.asarray(range(len(query)))
)
all_overlaps = [[] for _ in range(len(query))]

Expand Down Expand Up @@ -1842,7 +1822,7 @@ def count_overlaps(
min_overlap=min_overlap,
delete_index=delete_index,
)
return np.array([len(x) for x in _overlaps])
return np.asarray([len(x) for x in _overlaps])

def subset_by_overlaps(
self,
Expand Down Expand Up @@ -1909,8 +1889,8 @@ def _generic_search(
select,
delete_index=False,
):
min_start = min(self.start)
max_end = max(self.end)
min_start = self.start.min()
max_end = self.end.max()
hits = []
for _, val in query:
_iterate = True
Expand Down Expand Up @@ -2112,7 +2092,7 @@ def distance(self, query: "IRanges") -> np.ndarray:

all_distances.append(distance)

return np.array(all_distances)
return np.asarray(all_distances)

########################
#### pandas interop ####
Expand Down
22 changes: 10 additions & 12 deletions src/iranges/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def create_np_interval_vector(

max_end = force_size
if max_end is None:
max_end = max(intervals.get_end())
max_end = intervals.get_end().max()
else:
max_end += 1

Expand Down Expand Up @@ -89,16 +89,14 @@ def calc_gap_and_overlap(
Interval containing start and end positions.
`end` is non-inclusive.
"""
_gap = None
_overlap = None

if first[0] < second[1] and first[1] > second[0]:
if min(first[1], second[1]) > max(first[0], second[0]):
_overlap = min(first[1], second[1]) - max(first[0], second[0])
else:
_gap = None
if second[0] >= first[1]:
_gap = second[0] - first[1]
elif first[0] >= second[1]:
_gap = first[0] - second[1]
return (None, _overlap)

_gap = None
if second[0] >= first[1]:
_gap = second[0] - first[1]
elif first[0] >= second[1]:
_gap = first[0] - second[1]

return (_gap, _overlap)
return (_gap, None)
Loading