Skip to content

Commit

Permalink
Strand-awareness for search and overlap methods (#114)
Browse files Browse the repository at this point in the history
- Use accessor methods to access properties especially `get_seqnames()`
- Modify search and overlap methods for strand-awareness.
- Update tests and documentation.
  • Loading branch information
jkanche authored Jul 16, 2024
1 parent 258fcdf commit 1ab20d9
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 39 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# Changelog

## Version 0.4.27
## Version 0.4.27 - 0.4.28

- Implement `subtract` method, add tests.
- Use accessor methods to access properties especially `get_seqnames()`
- Modify search and overlap methods for strand-awareness.
- Update tests and documentation.

## Version 0.4.26

Expand Down
170 changes: 140 additions & 30 deletions src/genomicranges/GenomicRanges.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,9 @@ def __init__(
)

def _build_reverse_seqindex(self, seqinfo: SeqInfo):
self._reverse_seqindex = ut.reverse_index.build_reverse_index(seqinfo.seqnames)
self._reverse_seqindex = ut.reverse_index.build_reverse_index(
seqinfo.get_seqnames()
)

def _remove_reverse_seqindex(self):
del self._reverse_seqindex
Expand Down Expand Up @@ -354,7 +356,7 @@ def __str__(self) -> str:
header = ["seqnames", "<str>"]
_seqnames = []
for x in self._seqnames[indices]:
_seqnames.append(self._seqinfo.seqnames[x])
_seqnames.append(self._seqinfo.get_seqnames()[x])

showed = _seqnames
if insert_ellipsis:
Expand Down Expand Up @@ -407,7 +409,7 @@ def __str__(self) -> str:
+ str(len(self._seqinfo))
+ " sequences): "
+ ut.print_truncated_list(
self._seqinfo.seqnames,
self._seqinfo.get_seqnames(),
sep=" ",
include_brackets=False,
transform=lambda y: y,
Expand Down Expand Up @@ -453,9 +455,9 @@ def get_seqnames(
"""

if as_type == "factor":
return self._seqnames, self._seqinfo.seqnames
return self._seqnames, self._seqinfo.get_seqnames()
elif as_type == "list":
return [self._seqinfo.seqnames[x] for x in self._seqnames]
return [self._seqinfo.get_seqnames()[x] for x in self._seqnames]
else:
raise ValueError("Argument 'as_type' must be 'factor' or 'list'.")

Expand All @@ -481,7 +483,7 @@ def set_seqnames(

if not isinstance(seqnames, np.ndarray):
seqnames = np.asarray(
[self._seqinfo.seqnames.index(x) for x in list(seqnames)]
[self._seqinfo.get_seqnames().index(x) for x in list(seqnames)]
)

output = self._define_output(in_place)
Expand Down Expand Up @@ -570,7 +572,7 @@ def get_strand(
vector is retuned.
If ``factor``, a tuple width levels as a dictionary and
indices to ``seqinfo.seqnames`` is returned.
indices to ``seqinfo.get_seqnames()`` is returned.
If ``list``, then codes are mapped to levels and returned.
Expand Down Expand Up @@ -1684,7 +1686,7 @@ def range(
rev_map = []
groups = []

for seq in self._seqinfo.seqnames:
for seq in self._seqinfo.get_seqnames():
_iter_strands = [0] if ignore_strand is True else [1, -1, 0]
for strd in _iter_strands:
_key = f"{seq}{_granges_delim}{strd}"
Expand Down Expand Up @@ -1738,7 +1740,7 @@ def gaps(
all_grp_ranges = []
groups = []

for i, chrm in enumerate(self._seqinfo.seqnames):
for i, chrm in enumerate(self._seqinfo.get_seqnames()):
_iter_strands = [0] if ignore_strand is True else [1, -1, 0]
for strd in _iter_strands:
_key = f"{chrm}{_granges_delim}{strd}"
Expand Down Expand Up @@ -1801,7 +1803,7 @@ def disjoin(
rev_map = []
groups = []

for seq in self._seqinfo.seqnames:
for seq in self._seqinfo.get_seqnames():
_iter_strands = [0] if ignore_strand is True else [1, -1, 0]
for strd in _iter_strands:
_key = f"{seq}{_granges_delim}{strd}"
Expand Down Expand Up @@ -1930,7 +1932,7 @@ def setdiff(self, other: "GenomicRanges") -> "GenomicRanges":
range_bounds = all_combined.range(ignore_strand=True)
rb_ends = {}
for _, val in range_bounds:
rb_ends[val.seqnames[0]] = val.end[0]
rb_ends[val.get_seqnames()[0]] = val.end[0]

x_gaps = self.gaps(end=rb_ends)
x_gaps_u = x_gaps.union(other)
Expand Down Expand Up @@ -1960,7 +1962,7 @@ def intersect(self, other: "GenomicRanges") -> "GenomicRanges":
range_bounds = all_combined.range(ignore_strand=True)
rb_ends = {}
for _, val in range_bounds:
rb_ends[val.seqnames[0]] = val.end[0]
rb_ends[val.get_seqnames()[0]] = val.end[0]

_gaps = other.gaps(end=rb_ends)
# _inter = self.setdiff(_gaps)
Expand Down Expand Up @@ -2098,8 +2100,26 @@ def find_overlaps(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.find_overlaps(
Expand All @@ -2112,7 +2132,7 @@ def find_overlaps(
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = _rev_map

return rev_map
Expand Down Expand Up @@ -2172,8 +2192,26 @@ def count_overlaps(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.find_overlaps(
Expand All @@ -2186,7 +2224,7 @@ def count_overlaps(
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = len(_rev_map)

return rev_map
Expand Down Expand Up @@ -2246,8 +2284,26 @@ def subset_by_overlaps(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.find_overlaps(
Expand All @@ -2260,7 +2316,7 @@ def subset_by_overlaps(
)

for _, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map.extend(_rev_map)

return self[list(set(rev_map))]
Expand Down Expand Up @@ -2300,16 +2356,34 @@ def nearest(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.nearest(
query=_query_subset._ranges, select=select, delete_index=False
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = _rev_map

return rev_map
Expand Down Expand Up @@ -2345,16 +2419,34 @@ def precede(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.precede(
query=_query_subset._ranges, select=select, delete_index=False
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = _rev_map

return rev_map
Expand Down Expand Up @@ -2390,16 +2482,34 @@ def follow(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.follow(
query=_query_subset._ranges, select=select, delete_index=False
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = _rev_map

return rev_map
Expand Down Expand Up @@ -2452,9 +2562,9 @@ def match(self, query: "GenomicRanges") -> List[List[int]]:

for i in range(len(query)):
try:
_seqname = query.seqnames[i]
_seqname = query.get_seqnames()[i]
except Exception as _:
warn(f"'{query.seqnames[i]}' is not present in subject.")
warn(f"'{query.get_seqnames()[i]}' is not present in subject.")

_strand = query._strand[i]

Expand Down Expand Up @@ -2653,7 +2763,7 @@ def tile_by_range(
val._ranges._start[0], val._ranges.end[0] - 1, twidth
)

seqnames.extend([val.seqnames[0]] * len(all_intervals))
seqnames.extend([val.get_seqnames()[0]] * len(all_intervals))
strand.extend([int(val.strand[0])] * len(all_intervals))
starts.extend([x[0] for x in all_intervals])
widths.extend(x[1] for x in all_intervals)
Expand Down Expand Up @@ -2715,7 +2825,7 @@ def tile(
val._ranges._start[0], val._ranges.end[0] - 1, twidth
)

seqnames.extend([val.seqnames[0]] * len(all_intervals))
seqnames.extend([val.get_seqnames()[0]] * len(all_intervals))
strand.extend([int(val.strand[0])] * len(all_intervals))
starts.extend([x[0] for x in all_intervals])
widths.extend(x[1] for x in all_intervals)
Expand Down Expand Up @@ -2757,7 +2867,7 @@ def sliding_windows(self, width: int, step: int = 1) -> "GenomicRanges":
step=step,
)

seqnames.extend([val.seqnames[0]] * len(all_intervals))
seqnames.extend([val.get_seqnames()[0]] * len(all_intervals))
strand.extend([int(val.strand[0])] * len(all_intervals))
starts.extend([x[0] for x in all_intervals])
widths.extend(x[1] for x in all_intervals)
Expand Down Expand Up @@ -2814,7 +2924,7 @@ def tile_genome(

seqlen_ = seqlengths
if isinstance(seqlengths, SeqInfo):
seqlen_ = dict(zip(seqlengths.seqnames, seqlengths.seqlengths))
seqlen_ = dict(zip(seqlengths.get_seqnames(), seqlengths.seqlengths))

seqnames = []
strand = []
Expand Down
2 changes: 1 addition & 1 deletion src/genomicranges/SeqInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __iter__(self):

def __next__(self):
if self._current_index < len(self._sinfo):
iter_row_index = self._sinfo.seqnames[self._current_index]
iter_row_index = self._sinfo._seqnames[self._current_index]

iter_slice = self._sinfo[self._current_index]
self._current_index += 1
Expand Down
Loading

0 comments on commit 1ab20d9

Please sign in to comment.