Skip to content

Commit

Permalink
Methods to split GenomicRanges to GenomicRangesList and vice-versa (
Browse files Browse the repository at this point in the history
#109)

- Method to split `GenomicRanges` by a list of groups.
- Coerce `GenomicRangesList` to `GenomicRanges`.
- Add tests and documentation.
  • Loading branch information
jkanche authored Jul 12, 2024
1 parent c5d0cb9 commit dd21da1
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 4 deletions.
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
# Changelog

## Version 0.4.21
## Version 0.4.25

- Method to split `GenomicRanges` by a list of groups.
- Coerce `GenomicRangesList` to `GenomicRanges`.
- Add tests and documentation.


## Version 0.4.21 - 0.4.24

- Optimize the `intersect` operation on a large number of genomic regions.
- Add a `fast_combine_granges` method that only concatenates seqnames and intervals.
Expand Down
44 changes: 43 additions & 1 deletion src/genomicranges/GenomicRanges.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .SeqInfo import SeqInfo, merge_SeqInfo
from .utils import (
create_np_vector,
group_by_indices,
sanitize_strand_vector,
slide_intervals,
split_intervals,
Expand Down Expand Up @@ -331,7 +332,7 @@ def __str__(self) -> str:
A pretty-printed string containing the contents of this ``GenomicRanges``.
"""
output = f"GenomicRanges with {len(self)} range{'s' if len(self) != 1 else ''}"
output += f" and {len(self._mcols)} metadata column{'s' if len(self._mcols) != 1 else ''}\n"
output += f" and {len(self._mcols.get_column_names())} metadata column{'s' if len(self._mcols.get_column_names()) != 1 else ''}\n"

nr = len(self)
added_table = False
Expand Down Expand Up @@ -2892,6 +2893,47 @@ def binned_average(
output._mcols.set_column(outname, outvec, in_place=True)
return output

#######################
######>> split <<######
#######################

def split(self, groups: list) -> "GenomicRangesList":
    """Partition this `GenomicRanges` into a :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList`.

    Args:
        groups:
            One group label per genomic element. Elements that share a
            label are placed in the same entry of the result. Its length
            must equal the number of genomic elements in this object.

    Returns:
        A `GenomicRangesList` with one entry per distinct label; each
        entry is named after its label.

    Raises:
        ValueError:
            If ``groups`` and this object differ in length.
    """
    if len(self) != len(groups):
        raise ValueError(
            "Number of groups must match the number of genomic elements."
        )

    # Maps every distinct label to the positions at which it occurs.
    positions_by_label = group_by_indices(groups=groups)

    labels = list(positions_by_label.keys())
    pieces = [self[positions] for positions in positions_by_label.values()]

    # Imported at call time rather than at module level; presumably this
    # avoids a circular import, since GenomicRangesList uses GenomicRanges.
    from .GenomicRangesList import GenomicRangesList

    return GenomicRangesList(ranges=pieces, names=labels)

#######################
######>> empty <<######
#######################

@classmethod
def empty(cls):
"""Create an zero-length `GenomicRanges` object.
Expand Down
30 changes: 28 additions & 2 deletions src/genomicranges/GenomicRangesList.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def __str__(self) -> str:
output = (
f"GenomicRangesList with {len(self)} range{'s' if len(self) != 1 else ''}"
)
output += f" and {len(self._mcols)} metadata column{'s' if len(self._mcols) != 1 else ''}\n"
output += f" and {len(self._mcols.get_column_names())} metadata column{'s' if len(self._mcols.get_column_names()) != 1 else ''}\n"

if isinstance(self._ranges, GenomicRanges) and len(self._ranges) == 0:
output += "--- empty genomic ranges list ---"
Expand Down Expand Up @@ -729,7 +729,7 @@ def range_lengths(self) -> dict:
###################################

def to_pandas(self) -> "pandas.DataFrame":
"""Coerce object to a :py:class:`pandas.DataFrame`.
"""Coerce object to a :py:class:`~pandas.DataFrame`.
Returns:
A :py:class:`~pandas.DataFrame` object.
Expand Down Expand Up @@ -851,6 +851,32 @@ def empty(cls, n: int):

return cls(ranges=GenomicRanges.empty(), range_lengths=_range_lengths)

###############################
######>> to granges <<#########
###############################

def to_genomic_ranges(self) -> GenomicRanges:
    """Coerce object to a :py:class:`~genomicranges.GenomicRanges.GenomicRanges`.

    The elements of this list are merged into a single object with
    ``ut.combine_sequences``. When the list has names, each range in the
    result is named after the list element it came from.

    Returns:
        A :py:class:`~genomicranges.GenomicRanges.GenomicRanges` object.
        An empty list yields a zero-length ``GenomicRanges``.
    """
    # With nothing to unpack, ``combine_sequences`` would be called without
    # any positional argument and could not dispatch on a type, so the
    # empty case is answered directly.
    if len(self._ranges) == 0:
        return GenomicRanges.empty()

    _combined_ranges = ut.combine_sequences(*self._ranges)

    if self._names is None:
        return _combined_ranges

    # Repeat each element's name once per range recorded for that element,
    # so the flattened names line up with the combined ranges.
    _combined_names = []
    for i, rl in enumerate(self._range_lengths):
        _combined_names.extend([self._names[i]] * rl)

    return _combined_ranges.set_names(_combined_names)

def to_granges(self) -> GenomicRanges:
    """Shorthand for :py:meth:`~to_genomic_ranges`.

    Returns:
        Whatever :py:meth:`~to_genomic_ranges` returns for this object.
    """
    flattened = self.to_genomic_ranges()
    return flattened


@ut.combine_sequences.register(GenomicRangesList)
def _combine_grl(*x: GenomicRangesList):
Expand Down
10 changes: 10 additions & 0 deletions src/genomicranges/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from itertools import groupby
from typing import List, Optional, Sequence, Tuple, Union

import biocutils as ut
Expand Down Expand Up @@ -206,3 +207,12 @@ def create_np_vector(
_ = [revmap[x].append(idx + 1) for x in range(i[0] - 1, i[1])]

return cov, revmap


def group_by_indices(groups: list) -> dict:
    """Collect, for every distinct value in ``groups``, the positions where it occurs.

    Args:
        groups:
            Group labels. They are sorted and used as dictionary keys, so
            they must be mutually comparable and hashable.

    Returns:
        A dictionary mapping each distinct label to the list of indices
        holding that label. Keys appear in sorted label order; the indices
        of a label keep their original ascending order because the sort
        is stable.
    """
    # groupby only merges adjacent items, hence the sort by label first.
    ordered_pairs = sorted(enumerate(groups), key=lambda pair: pair[1])

    positions = {}
    for label, members in groupby(ordered_pairs, lambda pair: pair[1]):
        positions[label] = [index for index, _ in members]

    return positions
85 changes: 85 additions & 0 deletions tests/test_gr_to_grl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pytest
import pandas as pd
import numpy as np
from genomicranges import GenomicRanges, GenomicRangesList
from biocframe import BiocFrame
from iranges import IRanges
from random import random
import genomicranges

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"

# Shared fixture for the tests in this module: ten genomic ranges spread over
# three sequence names ("chr1", "chr2", "chr3"), with two metadata columns.
subject = GenomicRanges(
seqnames=[
"chr1",
"chr2",
"chr2",
"chr2",
"chr1",
"chr1",
"chr3",
"chr3",
"chr3",
"chr3",
],
ranges=IRanges(range(101, 111), range(121, 131)),
strand=["*", "-", "-", "*", "*", "+", "+", "+", "-", "-"],
mcols=BiocFrame(
{
"score": range(0, 10),
# Random values, so this column differs from run to run.
"GC": [random() for _ in range(10)],
}
),
)


def test_split():
    """Splitting by three distinct labels gives a three-element list covering every range."""
    assert subject is not None

    # One label per range in ``subject``, ten in total.
    labels = ["chr1"] + ["chr2"] * 3 + ["chr1"] * 2 + ["chr3"] * 4
    splits = subject.split(labels)

    assert splits is not None
    assert isinstance(splits, GenomicRangesList)
    assert len(splits) == 3
    print(splits.element_nrows())

    # No range may be lost or duplicated by the split.
    total_ranges = sum(splits.get_range_lengths())
    assert total_ranges == len(subject)


def test_to_granges():
    """Flattening a split object returns a ``GenomicRanges`` with as many ranges as the input."""
    assert subject is not None

    # One label per range in ``subject``, ten in total.
    labels = ["chr1"] + ["chr2"] * 3 + ["chr1"] * 2 + ["chr3"] * 4
    grouped = subject.split(labels)

    roundtrip = grouped.to_genomic_ranges()

    assert roundtrip is not None
    assert isinstance(roundtrip, GenomicRanges)
    assert len(roundtrip) == len(subject)

0 comments on commit dd21da1

Please sign in to comment.