diff --git a/CHANGELOG.md b/CHANGELOG.md index 1df51d7..a57a403 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,11 @@ # Changelog -## Version 0.4.27 +## Version 0.4.27 - 0.4.28 - Implement `subtract` method, add tests. +- Use accessor methods to access properties especially `get_seqnames()` +- Modify search and overlap methods for strand-awareness. +- Update tests and documentation. ## Version 0.4.26 diff --git a/src/genomicranges/GenomicRanges.py b/src/genomicranges/GenomicRanges.py index 6b86da4..5023a59 100644 --- a/src/genomicranges/GenomicRanges.py +++ b/src/genomicranges/GenomicRanges.py @@ -208,7 +208,9 @@ def __init__( ) def _build_reverse_seqindex(self, seqinfo: SeqInfo): - self._reverse_seqindex = ut.reverse_index.build_reverse_index(seqinfo.seqnames) + self._reverse_seqindex = ut.reverse_index.build_reverse_index( + seqinfo.get_seqnames() + ) def _remove_reverse_seqindex(self): del self._reverse_seqindex @@ -354,7 +356,7 @@ def __str__(self) -> str: header = ["seqnames", ""] _seqnames = [] for x in self._seqnames[indices]: - _seqnames.append(self._seqinfo.seqnames[x]) + _seqnames.append(self._seqinfo.get_seqnames()[x]) showed = _seqnames if insert_ellipsis: @@ -407,7 +409,7 @@ def __str__(self) -> str: + str(len(self._seqinfo)) + " sequences): " + ut.print_truncated_list( - self._seqinfo.seqnames, + self._seqinfo.get_seqnames(), sep=" ", include_brackets=False, transform=lambda y: y, @@ -453,9 +455,9 @@ def get_seqnames( """ if as_type == "factor": - return self._seqnames, self._seqinfo.seqnames + return self._seqnames, self._seqinfo.get_seqnames() elif as_type == "list": - return [self._seqinfo.seqnames[x] for x in self._seqnames] + return [self._seqinfo.get_seqnames()[x] for x in self._seqnames] else: raise ValueError("Argument 'as_type' must be 'factor' or 'list'.") @@ -481,7 +483,7 @@ def set_seqnames( if not isinstance(seqnames, np.ndarray): seqnames = np.asarray( - [self._seqinfo.seqnames.index(x) for x in list(seqnames)] + [self._seqinfo.get_seqnames().index(x) for x in list(seqnames)] ) output = self._define_output(in_place) @@ -570,7 +572,7 @@ def get_strand( vector is retuned. If ``factor``, a tuple width levels as a dictionary and - indices to ``seqinfo.seqnames`` is returned. + indices to ``seqinfo.get_seqnames()`` is returned. If ``list``, then codes are mapped to levels and returned. @@ -1684,7 +1686,7 @@ def range( rev_map = [] groups = [] - for seq in self._seqinfo.seqnames: + for seq in self._seqinfo.get_seqnames(): _iter_strands = [0] if ignore_strand is True else [1, -1, 0] for strd in _iter_strands: _key = f"{seq}{_granges_delim}{strd}" @@ -1738,7 +1740,7 @@ def gaps( all_grp_ranges = [] groups = [] - for i, chrm in enumerate(self._seqinfo.seqnames): + for i, chrm in enumerate(self._seqinfo.get_seqnames()): _iter_strands = [0] if ignore_strand is True else [1, -1, 0] for strd in _iter_strands: _key = f"{chrm}{_granges_delim}{strd}" @@ -1801,7 +1803,7 @@ def disjoin( rev_map = [] groups = [] - for seq in self._seqinfo.seqnames: + for seq in self._seqinfo.get_seqnames(): _iter_strands = [0] if ignore_strand is True else [1, -1, 0] for strd in _iter_strands: _key = f"{seq}{_granges_delim}{strd}" @@ -1930,7 +1932,7 @@ def setdiff(self, other: "GenomicRanges") -> "GenomicRanges": range_bounds = all_combined.range(ignore_strand=True) rb_ends = {} for _, val in range_bounds: - rb_ends[val.seqnames[0]] = val.end[0] + rb_ends[val.get_seqnames()[0]] = val.end[0] x_gaps = self.gaps(end=rb_ends) x_gaps_u = x_gaps.union(other) @@ -1960,7 +1962,7 @@ def intersect(self, other: "GenomicRanges") -> "GenomicRanges": range_bounds = all_combined.range(ignore_strand=True) rb_ends = {} for _, val in range_bounds: - rb_ends[val.seqnames[0]] = val.end[0] + rb_ends[val.get_seqnames()[0]] = val.end[0] _gaps = other.gaps(end=rb_ends) # _inter = self.setdiff(_gaps) @@ -2098,8 +2100,26 @@ def find_overlaps( query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand) for group, indices in query_chrm_grps.items(): + _subset = [] if group in subject_chrm_grps: - _sub_subset = self[subject_chrm_grps[group]] + _subset.extend(subject_chrm_grps[group]) + + _grp_split = group.split(_granges_delim) + if _grp_split[1] != "0": + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + else: + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + + _any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1" + if _any_grp_rev in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_rev]) + + if len(_subset) > 0: + _sub_subset = self[_subset] _query_subset = query[indices] res_idx = _sub_subset._ranges.find_overlaps( @@ -2112,7 +2132,7 @@ def find_overlaps( ) for j, val in enumerate(res_idx): - _rev_map = [subject_chrm_grps[group][x] for x in val] + _rev_map = [_subset[x] for x in val] rev_map[indices[j]] = _rev_map return rev_map @@ -2172,8 +2192,26 @@ def count_overlaps( query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand) for group, indices in query_chrm_grps.items(): + _subset = [] if group in subject_chrm_grps: - _sub_subset = self[subject_chrm_grps[group]] + _subset.extend(subject_chrm_grps[group]) + + _grp_split = group.split(_granges_delim) + if _grp_split[1] != "0": + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + else: + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + + _any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1" + if _any_grp_rev in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_rev]) + + if len(_subset) > 0: + _sub_subset = self[_subset] _query_subset = query[indices] res_idx = _sub_subset._ranges.find_overlaps( @@ -2186,7 +2224,7 @@ def count_overlaps( ) for j, val in enumerate(res_idx): - _rev_map = [subject_chrm_grps[group][x] for x in val] + _rev_map = [_subset[x] for x in val] rev_map[indices[j]] = len(_rev_map) return rev_map @@ -2246,8 +2284,26 @@ def subset_by_overlaps( query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand) for group, indices in query_chrm_grps.items(): + _subset = [] if group in subject_chrm_grps: - _sub_subset = self[subject_chrm_grps[group]] + _subset.extend(subject_chrm_grps[group]) + + _grp_split = group.split(_granges_delim) + if _grp_split[1] != "0": + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + else: + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + + _any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1" + if _any_grp_rev in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_rev]) + + if len(_subset) > 0: + _sub_subset = self[_subset] _query_subset = query[indices] res_idx = _sub_subset._ranges.find_overlaps( @@ -2260,7 +2316,7 @@ def subset_by_overlaps( ) for _, val in enumerate(res_idx): - _rev_map = [subject_chrm_grps[group][x] for x in val] + _rev_map = [_subset[x] for x in val] rev_map.extend(_rev_map) return self[list(set(rev_map))] @@ -2300,8 +2356,26 @@ def nearest( query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand) for group, indices in query_chrm_grps.items(): + _subset = [] if group in subject_chrm_grps: - _sub_subset = self[subject_chrm_grps[group]] + _subset.extend(subject_chrm_grps[group]) + + _grp_split = group.split(_granges_delim) + if _grp_split[1] != "0": + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + else: + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + + _any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1" + if _any_grp_rev in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_rev]) + + if len(_subset) > 0: + _sub_subset = self[_subset] _query_subset = query[indices] res_idx = _sub_subset._ranges.nearest( @@ -2309,7 +2383,7 @@ def nearest( ) for j, val in enumerate(res_idx): - _rev_map = [subject_chrm_grps[group][x] for x in val] + _rev_map = [_subset[x] for x in val] rev_map[indices[j]] = _rev_map return rev_map @@ -2345,8 +2419,26 @@ def precede( query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand) for group, indices in query_chrm_grps.items(): + _subset = [] if group in subject_chrm_grps: - _sub_subset = self[subject_chrm_grps[group]] + _subset.extend(subject_chrm_grps[group]) + + _grp_split = group.split(_granges_delim) + if _grp_split[1] != "0": + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + else: + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + + _any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1" + if _any_grp_rev in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_rev]) + + if len(_subset) > 0: + _sub_subset = self[_subset] _query_subset = query[indices] res_idx = _sub_subset._ranges.precede( @@ -2354,7 +2446,7 @@ def precede( ) for j, val in enumerate(res_idx): - _rev_map = [subject_chrm_grps[group][x] for x in val] + _rev_map = [_subset[x] for x in val] rev_map[indices[j]] = _rev_map return rev_map @@ -2390,8 +2482,26 @@ def follow( query_chrm_grps = query._group_indices_by_chrm(ignore_strand=ignore_strand) for group, indices in query_chrm_grps.items(): + _subset = [] if group in subject_chrm_grps: - _sub_subset = self[subject_chrm_grps[group]] + _subset.extend(subject_chrm_grps[group]) + + _grp_split = group.split(_granges_delim) + if _grp_split[1] != "0": + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}0" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + else: + _any_grp_fwd = f"{_grp_split[0]}{_granges_delim}1" + if _any_grp_fwd in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_fwd]) + + _any_grp_rev = f"{_grp_split[0]}{_granges_delim}-1" + if _any_grp_rev in subject_chrm_grps: + _subset.extend(subject_chrm_grps[_any_grp_rev]) + + if len(_subset) > 0: + _sub_subset = self[_subset] _query_subset = query[indices] res_idx = _sub_subset._ranges.follow( @@ -2399,7 +2509,7 @@ def follow( ) for j, val in enumerate(res_idx): - _rev_map = [subject_chrm_grps[group][x] for x in val] + _rev_map = [_subset[x] for x in val] rev_map[indices[j]] = _rev_map return rev_map @@ -2452,9 +2562,9 @@ def match(self, query: "GenomicRanges") -> List[List[int]]: for i in range(len(query)): try: - _seqname = query.seqnames[i] + _seqname = query.get_seqnames()[i] except Exception as _: - warn(f"'{query.seqnames[i]}' is not present in subject.") + warn(f"'{query.get_seqnames()[i]}' is not present in subject.") _strand = query._strand[i] @@ -2653,7 +2763,7 @@ def tile_by_range( val._ranges._start[0], val._ranges.end[0] - 1, twidth ) - seqnames.extend([val.seqnames[0]] * len(all_intervals)) + seqnames.extend([val.get_seqnames()[0]] * len(all_intervals)) strand.extend([int(val.strand[0])] * len(all_intervals)) starts.extend([x[0] for x in all_intervals]) widths.extend(x[1] for x in all_intervals) @@ -2715,7 +2825,7 @@ def tile( val._ranges._start[0], val._ranges.end[0] - 1, twidth ) - seqnames.extend([val.seqnames[0]] * len(all_intervals)) + seqnames.extend([val.get_seqnames()[0]] * len(all_intervals)) strand.extend([int(val.strand[0])] * len(all_intervals)) starts.extend([x[0] for x in all_intervals]) widths.extend(x[1] for x in all_intervals) @@ -2757,7 +2867,7 @@ def sliding_windows(self, width: int, step: int = 1) -> "GenomicRanges": step=step, ) - seqnames.extend([val.seqnames[0]] * len(all_intervals)) + seqnames.extend([val.get_seqnames()[0]] * len(all_intervals)) strand.extend([int(val.strand[0])] * len(all_intervals)) starts.extend([x[0] for x in all_intervals]) widths.extend(x[1] for x in all_intervals) @@ -2814,7 +2924,7 @@ def tile_genome( seqlen_ = seqlengths if isinstance(seqlengths, SeqInfo): - seqlen_ = dict(zip(seqlengths.seqnames, seqlengths.seqlengths)) + seqlen_ = dict(zip(seqlengths.get_seqnames(), seqlengths.seqlengths)) seqnames = [] strand = [] diff --git a/src/genomicranges/SeqInfo.py b/src/genomicranges/SeqInfo.py index 826af48..06e8be7 100644 --- a/src/genomicranges/SeqInfo.py +++ b/src/genomicranges/SeqInfo.py @@ -65,7 +65,7 @@ def __iter__(self): def __next__(self): if self._current_index < len(self._sinfo): - iter_row_index = self._sinfo.seqnames[self._current_index] + iter_row_index = self._sinfo._seqnames[self._current_index] iter_slice = self._sinfo[self._current_index] self._current_index += 1 diff --git a/tests/test_gr_binnedAvg.py b/tests/test_gr_binnedAvg.py index f2c4802..bdea117 100644 --- a/tests/test_gr_binnedAvg.py +++ b/tests/test_gr_binnedAvg.py @@ -44,4 +44,4 @@ def test_binned_average(): res = subject.binned_average(bins=query, scorename="score", outname="binned_score") assert res is not None - assert res.mcols.get_column("binned_score") == [2] + assert res.mcols.get_column("binned_score") == [3] diff --git a/tests/test_gr_methods_search.py b/tests/test_gr_methods_search.py index 8cc33f8..982fc88 100644 --- a/tests/test_gr_methods_search.py +++ b/tests/test_gr_methods_search.py @@ -40,7 +40,7 @@ def test_nearest(): query_hits = gr.nearest(q_gr) assert query_hits is not None - assert query_hits == [[4], [3], []] + assert query_hits == [[5], [1, 2, 3], [9]] def test_precede(): @@ -68,7 +68,7 @@ def test_follow(): query_hits = gr.follow(q_gr) assert query_hits is not None - assert query_hits == [[4], [], []] + assert query_hits == [[5], [], [9]] def test_distance(): diff --git a/tests/test_gr_overlaps.py b/tests/test_gr_overlaps.py index b6eed30..bb594ff 100644 --- a/tests/test_gr_overlaps.py +++ b/tests/test_gr_overlaps.py @@ -49,7 +49,7 @@ def test_find_overlaps(): assert res is not None assert isinstance(res, list) - assert res == [[1, 2]] + assert res == [[1, 2, 3]] def test_find_overlaps_query_type(): @@ -59,7 +59,20 @@ def test_find_overlaps_query_type(): res = subject.find_overlaps(query, query_type="within") assert res is not None - assert res == [[1, 2]] + assert res == [[1, 2, 3]] + + +def test_find_overlaps_rtrip(): + x = GenomicRanges(["chr1", "chr1"], IRanges([2, 9], [7, 19]), strand=["+", "-"]) + y = GenomicRanges(["chr1"], IRanges([5], [10]), strand=["*"]) + + resxy = x.find_overlaps(y) + assert resxy is not None + assert resxy == [[0, 1]] + + resyx = y.find_overlaps(x) + assert resyx is not None + assert resyx == [[0], [0]] def test_count_overlaps(): @@ -70,7 +83,7 @@ def test_count_overlaps(): assert res is not None assert isinstance(res, list) - assert res == [2] + assert res == [3] def test_subset_by_overlaps(): @@ -87,4 +100,4 @@ def test_subset_by_overlaps(): assert res is not None assert isinstance(res, GenomicRanges) - assert len(res) == 2 + assert len(res) == 3