add flanking meth

BiocPy · Nov 17, 2023 · 8eba7f3 · 8eba7f3
1 parent 93a5787
commit 8eba7f3
Show file tree

Hide file tree

Showing 3 changed files with 178 additions and 51 deletions.
diff --git a/src/genomicranges/GenomicRanges.py b/src/genomicranges/GenomicRanges.py
@@ -37,6 +37,11 @@ def _validate_seqnames(seqnames, seqinfo, num_ranges):
     if not isinstance(seqinfo, SeqInfo):
         raise TypeError("'seqinfo' is not an instance of `SeqInfo` class.")
 
+    if not all(x < len(seqinfo) for x in seqnames):
+        raise ValueError(
+            "'seqnames' contains sequence name not represented in 'seqinfo'."
+        )
+
 
 def _validate_ranges(ranges, num_ranges):
     if ranges is None:
@@ -102,7 +107,7 @@ def __next__(self):
                 else None
             )
 
-            iter_slice = self._gr.row(self._current_index)
+            iter_slice = self._gr[self._current_index]
             self._current_index += 1
             return (iter_row_index, iter_slice)
 
@@ -168,10 +173,8 @@ def __init__(
             seqinfo = SeqInfo(seqnames=list(set(seqnames)))
         self._seqinfo = seqinfo
 
-        self._reverse_seqnames = ut.reverse_index.build_reverse_index(
-            self._seqinfo.seqnames
-        )
-        self._seqnames = np.array([self._reverse_seqnames[x] for x in list(seqnames)])
+        self._reverse_seqindex = None
+        self._seqnames = self._sanitize_seqnames(seqnames, self._seqinfo)
         self._ranges = ranges
 
         if strand is None:
@@ -196,6 +199,21 @@ def __init__(
                 self._strand, self._mcols, self._names, _num_ranges
             )
 
+    def _build_reverse_seqindex(self, seqinfo: SeqInfo):
+        self._reverse_seqindex = ut.reverse_index.build_reverse_index(seqinfo.seqnames)
+
+    def _remove_reverse_seqindex(self):
+        del self._reverse_seqindex
+
+    def _sanitize_seqnames(self, seqnames, seqinfo):
+        if self._reverse_seqindex is None:
+            self._build_reverse_seqindex(seqinfo)
+
+        if not isinstance(seqnames, np.ndarray):
+            seqnames = np.array([self._reverse_seqindex[x] for x in list(seqnames)])
+
+        return seqnames
+
     def _define_output(self, in_place: bool = False) -> "GenomicRanges":
         if in_place is True:
             return self
@@ -533,14 +551,11 @@ def set_strand(
             A modified ``GenomicRanges`` object, either as a copy of the original
             or as a reference to the (in-place-modified) original.
         """
-        if strand is not None:
-            _validate_optional_attrs(strand, None, None, len(self))
-
-            if strand is None:
-                strand = np.repeat(0, len(self._seqnames))
-            else:
-                strand = sanitize_strand_vector(strand)
+        if strand is None:
+            strand = np.repeat(0, len(self))
 
+        strand = sanitize_strand_vector(strand)
+        _validate_optional_attrs(strand, None, None, len(self))
         output = self._define_output(in_place)
         output._strand = strand
         return output
@@ -1009,6 +1024,105 @@ def from_pandas(cls, input: "pandas.DataFrame") -> "GenomicRanges":
 
         return cls(ranges=ranges, seqnames=seqnames, names=names, mcols=mcols)
 
+    #####################################
+    ######>> intra-range methods <<######
+    #####################################
+
+    def flank(
+        self,
+        width: int,
+        start: bool = True,
+        both: bool = False,
+        ignore_strand: bool = False,
+        in_place: bool = False,
+    ) -> "GenomicRanges":
+        """Compute flanking ranges for each range. The logic for this comes from
+        the `R/GenomicRanges` & `IRanges` packages.
+
+        If ``start`` is ``True`` for a given range, the flanking occurs at the `start`,
+        otherwise the `end`.
+        The `widths` of the flanks are given by the ``width`` parameter.
+
+        ``width`` can be negative, in which case the flanking region is
+        reversed so that it represents a prefix or suffix of the range.
+
+        Usage:
+
+            `gr.flank(3, True)`, where "x" indicates a range in ``gr`` and "-" indicates the
+            resulting flanking region:
+                ---xxxxxxx
+            If ``start`` were ``False``, the range in ``gr`` becomes
+                xxxxxxx---
+            For negative width, i.e. `gr.flank(x, -3, FALSE)`, where "*" indicates the overlap
+            between "x" and the result:
+                xxxx***
+            If ``both`` is ``True``, then, for all ranges in "x", the flanking regions are
+            extended into (or out of, if ``width`` is negative) the range, so that the result
+            straddles the given endpoint and has twice the width given by width.
+
+            This is illustrated below for `gr.flank(3, both=TRUE)`:
+                ---***xxxx
+
+        Args:
+            width:
+                Width to flank by. May be negative.
+
+            start:
+                Whether to only flank starts. Defaults to True.
+
+            both:
+                Whether to flank both starts and ends. Defaults to False.
+
+            ignore_strand:
+                Whether to ignore strands. Defaults to False.
+
+            in_place:
+                Whether to modify the ``GenomicRanges`` object in place.
+
+        Returns:
+            A modified ``GenomicRanges`` object with the flanked regions,
+            either as a copy of the original or as a reference to the
+            (in-place-modified) original.
+        """
+
+        all_starts = self.start
+        all_ends = self.end
+        all_strands = self.strand
+
+        # figure out which position to pin, start or end?
+        start_flags = np.repeat(start, len(all_strands))
+        if not ignore_strand:
+            start_flags = [
+                start != (all_strands[i] == -1) for i in range(len(all_strands))
+            ]
+
+        new_starts = []
+        new_widths = []
+        # if both is true, then depending on start_flag, we extend it out
+        # I couldn't understand the scenario's with witdh <=0,
+        # so refer to the R implementation here
+        for idx in range(len(start_flags)):
+            sf = start_flags[idx]
+            tstart = 0
+            if both is True:
+                tstart = (
+                    all_starts[idx] - abs(width)
+                    if sf
+                    else all_ends[idx] - abs(width)
+                )
+            else:
+                if width >= 0:
+                    tstart = all_starts[idx] - abs(width) if sf else all_ends[idx]
+                else:
+                    tstart = all_starts[idx] if sf else all_ends[idx] + width
+
+            new_starts.append(tstart)
+            new_widths.append((width * (2 if both else 1) - 1))
+
+        output = self._define_output(in_place)
+        output._ranges = IRanges(new_starts, new_widths)
+        return output
+
 
 @ut.combine_sequences.register
 def _combine_GenomicRanges(*x: GenomicRanges) -> GenomicRanges:

diff --git a/tests/test_gr_methods_coverage.py b/tests/test_gr_methods_coverage.py
@@ -1,26 +1,36 @@
 import pytest
-import pandas as pd
 from genomicranges.GenomicRanges import GenomicRanges
+from iranges import IRanges
+from biocframe import BiocFrame
 from random import random
-import genomicranges
 
 __author__ = "jkanche"
 __copyright__ = "jkanche"
 __license__ = "MIT"
 
-df_gr = pd.DataFrame(
-    {
-        "seqnames": ["chr1", "chr2", "chr1", "chr3", "chr2"],
-        "starts": [101, 102, 103, 104, 109],
-        "ends": [112, 103, 128, 134, 111],
-        "strand": ["*", "-", "*", "+", "-"],
-        "score": range(0, 5),
-        "GC": [random() for _ in range(5)],
-    }
+gr = GenomicRanges(
+    seqnames=[
+        "chr1",
+        "chr2",
+        "chr2",
+        "chr2",
+        "chr1",
+        "chr1",
+        "chr3",
+        "chr3",
+        "chr3",
+        "chr3",
+    ],
+    ranges=IRanges(start=range(100, 110), width=[10] * 10),
+    strand=["-", "+", "+", "*", "*", "+", "+", "+", "-", "-"],
+    mcols=BiocFrame(
+        {
+            "score": range(0, 10),
+            "GC": [random() for _ in range(10)],
+        }
+    ),
 )
 
-gr = genomicranges.from_pandas(df_gr)
-
 
 def test_coverage_default():
     assert gr is not None

diff --git a/tests/test_gr_methods_flank.py b/tests/test_gr_methods_flank.py
@@ -2,40 +2,43 @@
 import pandas as pd
 from genomicranges.GenomicRanges import GenomicRanges
 from random import random
-import genomicranges
+from iranges import IRanges
+from biocframe import BiocFrame
+import numpy as np
 
 __author__ = "jkanche"
 __copyright__ = "jkanche"
 __license__ = "MIT"
 
-df_gr = pd.DataFrame(
-    {
-        "seqnames": [
-            "chr1",
-            "chr2",
-            "chr3",
-            "chr2",
-            "chr3",
-        ],
-        "starts": range(101, 106),
-        "ends": [112, 123, 128, 134, 111],
-        "strand": ["*", "-", "*", "+", "-"],
-        "score": range(0, 5),
-        "GC": [random() for _ in range(5)],
-    }
+gr = GenomicRanges(
+    seqnames=[
+        "chr1",
+        "chr2",
+        "chr3",
+        "chr2",
+        "chr3",
+    ],
+    ranges=IRanges([x for x in range(101, 106)], [11, 21, 25, 30, 5]),
+    strand=["*", "-", "*", "+", "-"],
+    mcols=BiocFrame(
+        {
+            "score": range(0, 5),
+            "GC": [random() for _ in range(5)],
+        }
+    ),
 )
 
-gr = genomicranges.from_pandas(df_gr)
-
 
 def test_flank_default():
     assert gr is not None
 
     flanked_gr = gr.flank(width=10)
 
+    print(flanked_gr.__repr__())
+
     assert flanked_gr is not None
-    assert flanked_gr.column("starts") == [91, 124, 93, 94, 112]
-    assert flanked_gr.column("ends") == [100, 133, 102, 103, 121]
+    assert (flanked_gr.start == np.array([91, 123, 93, 94, 110])).all()
+    assert (flanked_gr.end == np.array([100, 132, 102, 103, 119])).all()
 
 
 def test_flank_start_false():
@@ -44,8 +47,8 @@ def test_flank_start_false():
     flanked_gr = gr.flank(width=10, start=False)
 
     assert flanked_gr is not None
-    assert flanked_gr.column("starts") == [113, 92, 129, 135, 95]
-    assert flanked_gr.column("ends") == [122, 101, 138, 144, 104]
+    assert (flanked_gr.start == np.array([112, 92, 128, 134, 95])).all()
+    assert (flanked_gr.end == np.array([121, 101, 137, 143, 104])).all()
 
 
 def test_flank_both_true():
@@ -54,8 +57,8 @@ def test_flank_both_true():
     flanked_gr = gr.flank(width=10, both=True)
 
     assert flanked_gr is not None
-    assert flanked_gr.column("starts") == [91, 114, 93, 94, 102]
-    assert flanked_gr.column("ends") == [110, 133, 112, 113, 121]
+    assert (flanked_gr.start == np.array([91, 113, 93, 94, 100])).all()
+    assert (flanked_gr.end == np.array([110, 132, 112, 113, 119])).all()
 
 
 def test_flank_start_false_and_both_true():
@@ -64,5 +67,5 @@ def test_flank_start_false_and_both_true():
     flanked_gr = gr.flank(width=10, start=False, both=True)
 
     assert flanked_gr is not None
-    assert flanked_gr.column("starts") == [103, 92, 119, 125, 95]
-    assert flanked_gr.column("ends") == [122, 111, 138, 144, 114]
+    assert (flanked_gr.start == np.array([102, 92, 118, 124, 95])).all()
+    assert (flanked_gr.end == np.array([121, 111, 137, 143, 114])).all()