Skip to content

Commit

Permalink
add flanking meth
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche committed Nov 17, 2023
1 parent 93a5787 commit 8eba7f3
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 51 deletions.
138 changes: 126 additions & 12 deletions src/genomicranges/GenomicRanges.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ def _validate_seqnames(seqnames, seqinfo, num_ranges):
if not isinstance(seqinfo, SeqInfo):
raise TypeError("'seqinfo' is not an instance of `SeqInfo` class.")

if not all(x < len(seqinfo) for x in seqnames):
raise ValueError(
"'seqnames' contains sequence name not represented in 'seqinfo'."
)


def _validate_ranges(ranges, num_ranges):
if ranges is None:
Expand Down Expand Up @@ -102,7 +107,7 @@ def __next__(self):
else None
)

iter_slice = self._gr.row(self._current_index)
iter_slice = self._gr[self._current_index]
self._current_index += 1
return (iter_row_index, iter_slice)

Expand Down Expand Up @@ -168,10 +173,8 @@ def __init__(
seqinfo = SeqInfo(seqnames=list(set(seqnames)))
self._seqinfo = seqinfo

self._reverse_seqnames = ut.reverse_index.build_reverse_index(
self._seqinfo.seqnames
)
self._seqnames = np.array([self._reverse_seqnames[x] for x in list(seqnames)])
self._reverse_seqindex = None
self._seqnames = self._sanitize_seqnames(seqnames, self._seqinfo)
self._ranges = ranges

if strand is None:
Expand All @@ -196,6 +199,21 @@ def __init__(
self._strand, self._mcols, self._names, _num_ranges
)

def _build_reverse_seqindex(self, seqinfo: SeqInfo):
self._reverse_seqindex = ut.reverse_index.build_reverse_index(seqinfo.seqnames)

def _remove_reverse_seqindex(self):
del self._reverse_seqindex

def _sanitize_seqnames(self, seqnames, seqinfo):
if self._reverse_seqindex is None:
self._build_reverse_seqindex(seqinfo)

if not isinstance(seqnames, np.ndarray):
seqnames = np.array([self._reverse_seqindex[x] for x in list(seqnames)])

return seqnames

def _define_output(self, in_place: bool = False) -> "GenomicRanges":
if in_place is True:
return self
Expand Down Expand Up @@ -533,14 +551,11 @@ def set_strand(
A modified ``GenomicRanges`` object, either as a copy of the original
or as a reference to the (in-place-modified) original.
"""
if strand is not None:
_validate_optional_attrs(strand, None, None, len(self))

if strand is None:
strand = np.repeat(0, len(self._seqnames))
else:
strand = sanitize_strand_vector(strand)
if strand is None:
strand = np.repeat(0, len(self))

strand = sanitize_strand_vector(strand)
_validate_optional_attrs(strand, None, None, len(self))
output = self._define_output(in_place)
output._strand = strand
return output
Expand Down Expand Up @@ -1009,6 +1024,105 @@ def from_pandas(cls, input: "pandas.DataFrame") -> "GenomicRanges":

return cls(ranges=ranges, seqnames=seqnames, names=names, mcols=mcols)

#####################################
######>> intra-range methods <<######
#####################################

def flank(
self,
width: int,
start: bool = True,
both: bool = False,
ignore_strand: bool = False,
in_place: bool = False,
) -> "GenomicRanges":
"""Compute flanking ranges for each range. The logic for this comes from
the `R/GenomicRanges` & `IRanges` packages.
If ``start`` is ``True`` for a given range, the flanking occurs at the `start`,
otherwise the `end`.
The `widths` of the flanks are given by the ``width`` parameter.
``width`` can be negative, in which case the flanking region is
reversed so that it represents a prefix or suffix of the range.
Usage:
`gr.flank(3, True)`, where "x" indicates a range in ``gr`` and "-" indicates the
resulting flanking region:
---xxxxxxx
If ``start`` were ``False``, the range in ``gr`` becomes
xxxxxxx---
For negative width, i.e. `gr.flank(x, -3, FALSE)`, where "*" indicates the overlap
between "x" and the result:
xxxx***
If ``both`` is ``True``, then, for all ranges in "x", the flanking regions are
extended into (or out of, if ``width`` is negative) the range, so that the result
straddles the given endpoint and has twice the width given by width.
This is illustrated below for `gr.flank(3, both=TRUE)`:
---***xxxx
Args:
width:
Width to flank by. May be negative.
start:
Whether to only flank starts. Defaults to True.
both:
Whether to flank both starts and ends. Defaults to False.
ignore_strand:
Whether to ignore strands. Defaults to False.
in_place:
Whether to modify the ``GenomicRanges`` object in place.
Returns:
A modified ``GenomicRanges`` object with the flanked regions,
either as a copy of the original or as a reference to the
(in-place-modified) original.
"""

all_starts = self.start
all_ends = self.end
all_strands = self.strand

# figure out which position to pin, start or end?
start_flags = np.repeat(start, len(all_strands))
if not ignore_strand:
start_flags = [
start != (all_strands[i] == -1) for i in range(len(all_strands))
]

new_starts = []
new_widths = []
# if both is true, then depending on start_flag, we extend it out
# I couldn't understand the scenario's with witdh <=0,
# so refer to the R implementation here
for idx in range(len(start_flags)):
sf = start_flags[idx]
tstart = 0
if both is True:
tstart = (
all_starts[idx] - abs(width)
if sf
else all_ends[idx] - abs(width)
)
else:
if width >= 0:
tstart = all_starts[idx] - abs(width) if sf else all_ends[idx]
else:
tstart = all_starts[idx] if sf else all_ends[idx] + width

new_starts.append(tstart)
new_widths.append((width * (2 if both else 1) - 1))

output = self._define_output(in_place)
output._ranges = IRanges(new_starts, new_widths)
return output


@ut.combine_sequences.register
def _combine_GenomicRanges(*x: GenomicRanges) -> GenomicRanges:
Expand Down
36 changes: 23 additions & 13 deletions tests/test_gr_methods_coverage.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,36 @@
import pytest
import pandas as pd
from genomicranges.GenomicRanges import GenomicRanges
from iranges import IRanges
from biocframe import BiocFrame
from random import random
import genomicranges

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"

df_gr = pd.DataFrame(
{
"seqnames": ["chr1", "chr2", "chr1", "chr3", "chr2"],
"starts": [101, 102, 103, 104, 109],
"ends": [112, 103, 128, 134, 111],
"strand": ["*", "-", "*", "+", "-"],
"score": range(0, 5),
"GC": [random() for _ in range(5)],
}
gr = GenomicRanges(
seqnames=[
"chr1",
"chr2",
"chr2",
"chr2",
"chr1",
"chr1",
"chr3",
"chr3",
"chr3",
"chr3",
],
ranges=IRanges(start=range(100, 110), width=[10] * 10),
strand=["-", "+", "+", "*", "*", "+", "+", "+", "-", "-"],
mcols=BiocFrame(
{
"score": range(0, 10),
"GC": [random() for _ in range(10)],
}
),
)

gr = genomicranges.from_pandas(df_gr)


def test_coverage_default():
assert gr is not None
Expand Down
55 changes: 29 additions & 26 deletions tests/test_gr_methods_flank.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,43 @@
import pandas as pd
from genomicranges.GenomicRanges import GenomicRanges
from random import random
import genomicranges
from iranges import IRanges
from biocframe import BiocFrame
import numpy as np

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"

df_gr = pd.DataFrame(
{
"seqnames": [
"chr1",
"chr2",
"chr3",
"chr2",
"chr3",
],
"starts": range(101, 106),
"ends": [112, 123, 128, 134, 111],
"strand": ["*", "-", "*", "+", "-"],
"score": range(0, 5),
"GC": [random() for _ in range(5)],
}
gr = GenomicRanges(
seqnames=[
"chr1",
"chr2",
"chr3",
"chr2",
"chr3",
],
ranges=IRanges([x for x in range(101, 106)], [11, 21, 25, 30, 5]),
strand=["*", "-", "*", "+", "-"],
mcols=BiocFrame(
{
"score": range(0, 5),
"GC": [random() for _ in range(5)],
}
),
)

gr = genomicranges.from_pandas(df_gr)


def test_flank_default():
assert gr is not None

flanked_gr = gr.flank(width=10)

print(flanked_gr.__repr__())

assert flanked_gr is not None
assert flanked_gr.column("starts") == [91, 124, 93, 94, 112]
assert flanked_gr.column("ends") == [100, 133, 102, 103, 121]
assert (flanked_gr.start == np.array([91, 123, 93, 94, 110])).all()
assert (flanked_gr.end == np.array([100, 132, 102, 103, 119])).all()


def test_flank_start_false():
Expand All @@ -44,8 +47,8 @@ def test_flank_start_false():
flanked_gr = gr.flank(width=10, start=False)

assert flanked_gr is not None
assert flanked_gr.column("starts") == [113, 92, 129, 135, 95]
assert flanked_gr.column("ends") == [122, 101, 138, 144, 104]
assert (flanked_gr.start == np.array([112, 92, 128, 134, 95])).all()
assert (flanked_gr.end == np.array([121, 101, 137, 143, 104])).all()


def test_flank_both_true():
Expand All @@ -54,8 +57,8 @@ def test_flank_both_true():
flanked_gr = gr.flank(width=10, both=True)

assert flanked_gr is not None
assert flanked_gr.column("starts") == [91, 114, 93, 94, 102]
assert flanked_gr.column("ends") == [110, 133, 112, 113, 121]
assert (flanked_gr.start == np.array([91, 113, 93, 94, 100])).all()
assert (flanked_gr.end == np.array([110, 132, 112, 113, 119])).all()


def test_flank_start_false_and_both_true():
Expand All @@ -64,5 +67,5 @@ def test_flank_start_false_and_both_true():
flanked_gr = gr.flank(width=10, start=False, both=True)

assert flanked_gr is not None
assert flanked_gr.column("starts") == [103, 92, 119, 125, 95]
assert flanked_gr.column("ends") == [122, 111, 138, 144, 114]
assert (flanked_gr.start == np.array([102, 92, 118, 124, 95])).all()
assert (flanked_gr.end == np.array([121, 111, 137, 143, 114])).all()

0 comments on commit 8eba7f3

Please sign in to comment.