Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Methods to split GenomicRanges to GenomicRangesList and vice-versa #109

Merged
merged 7 commits into from
Jul 12, 2024
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
# Changelog

## Version 0.4.21
## Version 0.4.25

- Method to split `GenomicRanges` by a list of groups.
- Coerce `GenomicRangesList` to `GenomicRanges`.
- Add tests and documentation.


## Version 0.4.21 - 0.4.24

- Optimize `intersect` operation on a large number of genomic regions
- Add a `fast_combine_granges` method that only concatenates seqnames and intervals.
Expand Down
44 changes: 43 additions & 1 deletion src/genomicranges/GenomicRanges.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .SeqInfo import SeqInfo, merge_SeqInfo
from .utils import (
create_np_vector,
group_by_indices,
sanitize_strand_vector,
slide_intervals,
split_intervals,
Expand Down Expand Up @@ -331,7 +332,7 @@ def __str__(self) -> str:
A pretty-printed string containing the contents of this ``GenomicRanges``.
"""
output = f"GenomicRanges with {len(self)} range{'s' if len(self) != 1 else ''}"
output += f" and {len(self._mcols)} metadata column{'s' if len(self._mcols) != 1 else ''}\n"
output += f" and {len(self._mcols.get_column_names())} metadata column{'s' if len(self._mcols.get_column_names()) != 1 else ''}\n"

nr = len(self)
added_table = False
Expand Down Expand Up @@ -2892,6 +2893,47 @@ def binned_average(
output._mcols.set_column(outname, outvec, in_place=True)
return output

#######################
######>> split <<######
#######################

def split(self, groups: list) -> "GenomicRangesList":
    """Partition this `GenomicRanges` into a :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList`.

    Args:
        groups:
            Group label (factor) for every genomic element, in order.

            Its length must equal the number of genomic elements
            in the object.

    Returns:
        A `GenomicRangesList` with one entry per distinct label, named
        after that label and holding the elements assigned to it.

    Raises:
        ValueError:
            If ``groups`` and the object differ in length.
    """
    if len(groups) != len(self):
        raise ValueError(
            "Number of groups must match the number of genomic elements."
        )

    # label -> positions of the elements carrying that label
    positions_by_label = group_by_indices(groups=groups)

    labels = list(positions_by_label.keys())
    subsets = [self[positions_by_label[label]] for label in labels]

    # Imported here rather than at module level, as in the original code.
    from .GenomicRangesList import GenomicRangesList

    return GenomicRangesList(ranges=subsets, names=labels)

#######################
######>> empty <<######
#######################

@classmethod
def empty(cls):
"""Create an zero-length `GenomicRanges` object.
Expand Down
30 changes: 28 additions & 2 deletions src/genomicranges/GenomicRangesList.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def __str__(self) -> str:
output = (
f"GenomicRangesList with {len(self)} range{'s' if len(self) != 1 else ''}"
)
output += f" and {len(self._mcols)} metadata column{'s' if len(self._mcols) != 1 else ''}\n"
output += f" and {len(self._mcols.get_column_names())} metadata column{'s' if len(self._mcols.get_column_names()) != 1 else ''}\n"

if isinstance(self._ranges, GenomicRanges) and len(self._ranges) == 0:
output += "--- empty genomic ranges list ---"
Expand Down Expand Up @@ -729,7 +729,7 @@ def range_lengths(self) -> dict:
###################################

def to_pandas(self) -> "pandas.DataFrame":
"""Coerce object to a :py:class:`pandas.DataFrame`.
"""Coerce object to a :py:class:`~pandas.DataFrame`.

Returns:
A :py:class:`~pandas.DataFrame` object.
Expand Down Expand Up @@ -851,6 +851,32 @@ def empty(cls, n: int):

return cls(ranges=GenomicRanges.empty(), range_lengths=_range_lengths)

###############################
######>> to granges <<#########
###############################

def to_genomic_ranges(self) -> GenomicRanges:
    """Flatten this list into a single :py:class:`~genomicranges.GenomicRanges.GenomicRanges`.

    All elements are combined with ``ut.combine_sequences``. When the list
    carries names, each element's name is repeated once per range of that
    element and the result is attached to the combined object as its names.

    Returns:
        A :py:class:`~genomicranges.GenomicRanges.GenomicRanges` object.
    """
    merged = ut.combine_sequences(*self._ranges)

    # Unnamed list: nothing to propagate onto the combined ranges.
    if self._names is None:
        return merged

    expanded_names = []
    for position, count in enumerate(self._range_lengths):
        expanded_names += [self._names[position]] * count

    return merged.set_names(expanded_names)

def to_granges(self) -> GenomicRanges:
    """Alias to :py:meth:`~to_genomic_ranges`.

    Returns:
        The result of calling :py:meth:`~to_genomic_ranges` on this object.
    """
    return self.to_genomic_ranges()


@ut.combine_sequences.register(GenomicRangesList)
def _combine_grl(*x: GenomicRangesList):
Expand Down
10 changes: 10 additions & 0 deletions src/genomicranges/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from itertools import groupby
from typing import List, Optional, Sequence, Tuple, Union

import biocutils as ut
Expand Down Expand Up @@ -206,3 +207,12 @@ def create_np_vector(
_ = [revmap[x].append(idx + 1) for x in range(i[0] - 1, i[1])]

return cov, revmap


def group_by_indices(groups: list) -> dict:
    """Map each distinct group label to the positions where it occurs.

    Args:
        groups:
            Sequence of hashable group labels, one per element.

    Returns:
        A dictionary whose keys are the distinct labels and whose values
        are lists of the 0-based positions carrying that label, in
        ascending order. Keys are in sorted order when the labels can be
        compared with one another; otherwise (for example ``None`` mixed
        with strings) they keep the order of first appearance.
    """
    positions = {}
    for idx, label in enumerate(groups):
        positions.setdefault(label, []).append(idx)

    try:
        ordered_labels = sorted(positions)
    except TypeError:
        # Labels of mixed, non-comparable types cannot be sorted; fall back
        # to first-appearance order instead of failing.
        return positions

    return {label: positions[label] for label in ordered_labels}
85 changes: 85 additions & 0 deletions tests/test_gr_to_grl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pytest
import pandas as pd
import numpy as np
from genomicranges import GenomicRanges, GenomicRangesList
from biocframe import BiocFrame
from iranges import IRanges
from random import random
import genomicranges

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"

# Shared fixture used by every test in this module: ten ranges whose
# seqnames are "chr1" (3 entries), "chr2" (3 entries) and "chr3" (4 entries).
# The "GC" metadata column is filled with fresh random values on import, so
# tests must not depend on its contents.
subject = GenomicRanges(
    seqnames=[
        "chr1",
        "chr2",
        "chr2",
        "chr2",
        "chr1",
        "chr1",
        "chr3",
        "chr3",
        "chr3",
        "chr3",
    ],
    ranges=IRanges(range(101, 111), range(121, 131)),
    strand=["*", "-", "-", "*", "*", "+", "+", "+", "-", "-"],
    mcols=BiocFrame(
        {
            "score": range(0, 10),
            "GC": [random() for _ in range(10)],
        }
    ),
)


def test_split():
    """Splitting by seqname labels yields one named group per label."""
    assert subject is not None

    groups = [
        "chr1",
        "chr2",
        "chr2",
        "chr2",
        "chr1",
        "chr1",
        "chr3",
        "chr3",
        "chr3",
        "chr3",
    ]
    splits = subject.split(groups)

    assert splits is not None
    assert isinstance(splits, GenomicRangesList)
    assert len(splits) == 3

    # Three "chr1", three "chr2" and four "chr3" labels were supplied, so
    # every element must land in exactly one group of the matching size.
    range_lengths = splits.get_range_lengths()
    assert sorted(range_lengths) == [3, 3, 4]
    assert sum(range_lengths) == len(subject)

    # A label vector whose length differs from the object must be rejected.
    with pytest.raises(ValueError):
        subject.split(groups[:-1])


def test_to_granges():
    """A split followed by coercion returns a `GenomicRanges` of equal size."""
    assert subject is not None

    splits = subject.split(
        [
            "chr1",
            "chr2",
            "chr2",
            "chr2",
            "chr1",
            "chr1",
            "chr3",
            "chr3",
            "chr3",
            "chr3",
        ]
    )

    roundtrip = splits.to_genomic_ranges()

    assert roundtrip is not None
    assert isinstance(roundtrip, GenomicRanges)
    assert len(roundtrip) == len(subject)

    # ``to_granges`` is an alias of ``to_genomic_ranges`` and must agree.
    via_alias = splits.to_granges()
    assert isinstance(via_alias, GenomicRanges)
    assert len(via_alias) == len(roundtrip)
Loading