Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use accessor methods to access seqnames #114

Merged
merged 10 commits into from
Jul 16, 2024
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# Changelog

## Version 0.4.27
## Version 0.4.27 - 0.4.28

- Implement `subtract` method, add tests.
- Use accessor methods to access properties especially `get_seqnames()`
- Modify search and overlap methods for strand-awareness.
- Update tests and documentation.

## Version 0.4.26

Expand Down
170 changes: 140 additions & 30 deletions src/genomicranges/GenomicRanges.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,9 @@ def __init__(
)

def _build_reverse_seqindex(self, seqinfo: SeqInfo):
self._reverse_seqindex = ut.reverse_index.build_reverse_index(seqinfo.seqnames)
self._reverse_seqindex = ut.reverse_index.build_reverse_index(
seqinfo.get_seqnames()
)

def _remove_reverse_seqindex(self):
del self._reverse_seqindex
Expand Down Expand Up @@ -354,7 +356,7 @@ def __str__(self) -> str:
header = ["seqnames", "<str>"]
_seqnames = []
for x in self._seqnames[indices]:
_seqnames.append(self._seqinfo.seqnames[x])
_seqnames.append(self._seqinfo.get_seqnames()[x])

showed = _seqnames
if insert_ellipsis:
Expand Down Expand Up @@ -407,7 +409,7 @@ def __str__(self) -> str:
+ str(len(self._seqinfo))
+ " sequences): "
+ ut.print_truncated_list(
self._seqinfo.seqnames,
self._seqinfo.get_seqnames(),
sep=" ",
include_brackets=False,
transform=lambda y: y,
Expand Down Expand Up @@ -453,9 +455,9 @@ def get_seqnames(
"""

if as_type == "factor":
return self._seqnames, self._seqinfo.seqnames
return self._seqnames, self._seqinfo.get_seqnames()
elif as_type == "list":
return [self._seqinfo.seqnames[x] for x in self._seqnames]
return [self._seqinfo.get_seqnames()[x] for x in self._seqnames]
else:
raise ValueError("Argument 'as_type' must be 'factor' or 'list'.")

Expand All @@ -481,7 +483,7 @@ def set_seqnames(

if not isinstance(seqnames, np.ndarray):
seqnames = np.asarray(
[self._seqinfo.seqnames.index(x) for x in list(seqnames)]
[self._seqinfo.get_seqnames().index(x) for x in list(seqnames)]
)

output = self._define_output(in_place)
Expand Down Expand Up @@ -570,7 +572,7 @@ def get_strand(
vector is retuned.

If ``factor``, a tuple width levels as a dictionary and
indices to ``seqinfo.seqnames`` is returned.
indices to ``seqinfo.get_seqnames()`` is returned.

If ``list``, then codes are mapped to levels and returned.

Expand Down Expand Up @@ -1684,7 +1686,7 @@ def range(
rev_map = []
groups = []

for seq in self._seqinfo.seqnames:
for seq in self._seqinfo.get_seqnames():
_iter_strands = [0] if ignore_strand is True else [1, -1, 0]
for strd in _iter_strands:
_key = f"{seq}{_granges_delim}{strd}"
Expand Down Expand Up @@ -1738,7 +1740,7 @@ def gaps(
all_grp_ranges = []
groups = []

for i, chrm in enumerate(self._seqinfo.seqnames):
for i, chrm in enumerate(self._seqinfo.get_seqnames()):
_iter_strands = [0] if ignore_strand is True else [1, -1, 0]
for strd in _iter_strands:
_key = f"{chrm}{_granges_delim}{strd}"
Expand Down Expand Up @@ -1801,7 +1803,7 @@ def disjoin(
rev_map = []
groups = []

for seq in self._seqinfo.seqnames:
for seq in self._seqinfo.get_seqnames():
_iter_strands = [0] if ignore_strand is True else [1, -1, 0]
for strd in _iter_strands:
_key = f"{seq}{_granges_delim}{strd}"
Expand Down Expand Up @@ -1930,7 +1932,7 @@ def setdiff(self, other: "GenomicRanges") -> "GenomicRanges":
range_bounds = all_combined.range(ignore_strand=True)
rb_ends = {}
for _, val in range_bounds:
rb_ends[val.seqnames[0]] = val.end[0]
rb_ends[val.get_seqnames()[0]] = val.end[0]

x_gaps = self.gaps(end=rb_ends)
x_gaps_u = x_gaps.union(other)
Expand Down Expand Up @@ -1960,7 +1962,7 @@ def intersect(self, other: "GenomicRanges") -> "GenomicRanges":
range_bounds = all_combined.range(ignore_strand=True)
rb_ends = {}
for _, val in range_bounds:
rb_ends[val.seqnames[0]] = val.end[0]
rb_ends[val.get_seqnames()[0]] = val.end[0]

_gaps = other.gaps(end=rb_ends)
# _inter = self.setdiff(_gaps)
Expand Down Expand Up @@ -2098,8 +2100,26 @@ def find_overlaps(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.find_overlaps(
Expand All @@ -2112,7 +2132,7 @@ def find_overlaps(
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = _rev_map

return rev_map
Expand Down Expand Up @@ -2172,8 +2192,26 @@ def count_overlaps(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.find_overlaps(
Expand All @@ -2186,7 +2224,7 @@ def count_overlaps(
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = len(_rev_map)

return rev_map
Expand Down Expand Up @@ -2246,8 +2284,26 @@ def subset_by_overlaps(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.find_overlaps(
Expand All @@ -2260,7 +2316,7 @@ def subset_by_overlaps(
)

for _, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map.extend(_rev_map)

return self[list(set(rev_map))]
Expand Down Expand Up @@ -2300,16 +2356,34 @@ def nearest(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.nearest(
query=_query_subset._ranges, select=select, delete_index=False
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = _rev_map

return rev_map
Expand Down Expand Up @@ -2345,16 +2419,34 @@ def precede(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.precede(
query=_query_subset._ranges, select=select, delete_index=False
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = _rev_map

return rev_map
Expand Down Expand Up @@ -2390,16 +2482,34 @@ def follow(
query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand)

for group, indices in query_chrm_grps.items():
_subset = []
if group in subject_chrm_grps:
_sub_subset = self[subject_chrm_grps[group]]
_subset.extend(subject_chrm_grps[group])

_grp_split = group.split(_granges_delim)
if _grp_split[1] != "0":
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])
else:
_any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1"
if _any_grp_fwd in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_fwd])

_any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1"
if _any_grp_rev in subject_chrm_grps:
_subset.extend(subject_chrm_grps[_any_grp_rev])

if len(_subset) > 0:
_sub_subset = self[_subset]
_query_subset = query[indices]

res_idx = _sub_subset._ranges.follow(
query=_query_subset._ranges, select=select, delete_index=False
)

for j, val in enumerate(res_idx):
_rev_map = [subject_chrm_grps[group][x] for x in val]
_rev_map = [_subset[x] for x in val]
rev_map[indices[j]] = _rev_map

return rev_map
Expand Down Expand Up @@ -2452,9 +2562,9 @@ def match(self, query: "GenomicRanges") -> List[List[int]]:

for i in range(len(query)):
try:
_seqname = query.seqnames[i]
_seqname = query.get_seqnames()[i]
except Exception as _:
warn(f"'{query.seqnames[i]}' is not present in subject.")
warn(f"'{query.get_seqnames()[i]}' is not present in subject.")

_strand = query._strand[i]

Expand Down Expand Up @@ -2653,7 +2763,7 @@ def tile_by_range(
val._ranges._start[0], val._ranges.end[0] - 1, twidth
)

seqnames.extend([val.seqnames[0]] * len(all_intervals))
seqnames.extend([val.get_seqnames()[0]] * len(all_intervals))
strand.extend([int(val.strand[0])] * len(all_intervals))
starts.extend([x[0] for x in all_intervals])
widths.extend(x[1] for x in all_intervals)
Expand Down Expand Up @@ -2715,7 +2825,7 @@ def tile(
val._ranges._start[0], val._ranges.end[0] - 1, twidth
)

seqnames.extend([val.seqnames[0]] * len(all_intervals))
seqnames.extend([val.get_seqnames()[0]] * len(all_intervals))
strand.extend([int(val.strand[0])] * len(all_intervals))
starts.extend([x[0] for x in all_intervals])
widths.extend(x[1] for x in all_intervals)
Expand Down Expand Up @@ -2757,7 +2867,7 @@ def sliding_windows(self, width: int, step: int = 1) -> "GenomicRanges":
step=step,
)

seqnames.extend([val.seqnames[0]] * len(all_intervals))
seqnames.extend([val.get_seqnames()[0]] * len(all_intervals))
strand.extend([int(val.strand[0])] * len(all_intervals))
starts.extend([x[0] for x in all_intervals])
widths.extend(x[1] for x in all_intervals)
Expand Down Expand Up @@ -2814,7 +2924,7 @@ def tile_genome(

seqlen_ = seqlengths
if isinstance(seqlengths, SeqInfo):
seqlen_ = dict(zip(seqlengths.seqnames, seqlengths.seqlengths))
seqlen_ = dict(zip(seqlengths.get_seqnames(), seqlengths.seqlengths))

seqnames = []
strand = []
Expand Down
2 changes: 1 addition & 1 deletion src/genomicranges/SeqInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __iter__(self):

def __next__(self):
if self._current_index < len(self._sinfo):
iter_row_index = self._sinfo.seqnames[self._current_index]
iter_row_index = self._sinfo._seqnames[self._current_index]

iter_slice = self._sinfo[self._current_index]
self._current_index += 1
Expand Down
Loading
Loading