Methods to split GenomicRanges to GenomicRangesList and vice-versa (#109)

- Method to split `GenomicRanges` by a list of groups.
- Coerce `GenomicRangesList` to `GenomicRanges`.
- Add tests and documentation.
BiocPy · Jul 12, 2024 · dd21da1 · dd21da1
1 parent c5d0cb9
commit dd21da1
Show file tree

Hide file tree

Showing 5 changed files with 174 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,13 @@
 # Changelog
 
-## Version 0.4.21
+## Version 0.4.25
+
+- Method to split `GenomicRanges` by a list of groups.
+- Coerce `GenomicRangesList` to `GenomicRanges`.
+- Add tests and documentation.
+
+
+## Version 0.4.21 - 0.4.24
 
 - Optimize `intersect` operation on large number of genomic regions
 - Add a `fast_combine_granges` method that only concatenates seqnames and intervals.

diff --git a/src/genomicranges/GenomicRanges.py b/src/genomicranges/GenomicRanges.py
@@ -9,6 +9,7 @@
 from .SeqInfo import SeqInfo, merge_SeqInfo
 from .utils import (
     create_np_vector,
+    group_by_indices,
     sanitize_strand_vector,
     slide_intervals,
     split_intervals,
@@ -331,7 +332,7 @@ def __str__(self) -> str:
             A pretty-printed string containing the contents of this ``GenomicRanges``.
         """
         output = f"GenomicRanges with {len(self)} range{'s' if len(self) != 1 else ''}"
-        output += f" and {len(self._mcols)} metadata column{'s' if len(self._mcols) != 1 else ''}\n"
+        output += f" and {len(self._mcols.get_column_names())} metadata column{'s' if len(self._mcols.get_column_names()) != 1 else ''}\n"
 
         nr = len(self)
         added_table = False
@@ -2892,6 +2893,47 @@ def binned_average(
         output._mcols.set_column(outname, outvec, in_place=True)
         return output
 
+    #######################
+    ######>> split <<######
+    #######################
+
+    def split(self, groups: list) -> "GenomicRangesList":
+        """Partition the ranges of this object by a per-element group label.
+
+        Args:
+            groups:
+                One label per genomic element, given in element order.
+                Its length must equal ``len(self)``.
+
+        Returns:
+            A :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList`
+            with one entry per distinct label. Each entry is ``self``
+            subset to the positions carrying that label, and the entries
+            are named by their labels.
+
+        Raises:
+            ValueError:
+                If ``groups`` and this object differ in length.
+        """
+        n_labels = len(groups)
+        if n_labels != len(self):
+            raise ValueError(
+                "Number of groups must match the number of genomic elements."
+            )
+
+        # label -> positions of the elements that carry this label
+        positions_by_label = group_by_indices(groups=groups)
+        labels = list(positions_by_label.keys())
+        subsets = [self[idx] for idx in positions_by_label.values()]
+
+        # Imported inside the method, not at module level.
+        from .GenomicRangesList import GenomicRangesList
+
+        return GenomicRangesList(ranges=subsets, names=labels)
+
+    #######################
+    ######>> empty <<######
+    #######################
+
     @classmethod
     def empty(cls):
         """Create an zero-length `GenomicRanges` object.

diff --git a/src/genomicranges/GenomicRangesList.py b/src/genomicranges/GenomicRangesList.py
@@ -283,7 +283,7 @@ def __str__(self) -> str:
         output = (
             f"GenomicRangesList with {len(self)} range{'s' if len(self) != 1 else ''}"
         )
-        output += f" and {len(self._mcols)} metadata column{'s' if len(self._mcols) != 1 else ''}\n"
+        output += f" and {len(self._mcols.get_column_names())} metadata column{'s' if len(self._mcols.get_column_names()) != 1 else ''}\n"
 
         if isinstance(self._ranges, GenomicRanges) and len(self._ranges) == 0:
             output += "--- empty genomic ranges list ---"
@@ -729,7 +729,7 @@ def range_lengths(self) -> dict:
     ###################################
 
     def to_pandas(self) -> "pandas.DataFrame":
-        """Coerce object to a :py:class:`pandas.DataFrame`.
+        """Coerce object to a :py:class:`~pandas.DataFrame`.
 
         Returns:
             A :py:class:`~pandas.DataFrame` object.
@@ -851,6 +851,32 @@ def empty(cls, n: int):
 
         return cls(ranges=GenomicRanges.empty(), range_lengths=_range_lengths)
 
+    ###############################
+    ######>> to granges <<#########
+    ###############################
+
+    def to_genomic_ranges(self) -> GenomicRanges:
+        """Coerce object to a :py:class:`~genomicranges.GenomicRanges.GenomicRanges`.
+
+        The ranges of all elements are combined into a single object. When
+        the list has names, every range is named after the element it came
+        from (the element name is repeated once per range).
+
+        Returns:
+            A :py:class:`~genomicranges.GenomicRanges.GenomicRanges` object.
+        """
+        if isinstance(self._ranges, GenomicRanges):
+            # An empty list stores a bare ``GenomicRanges`` (see ``empty``);
+            # unpacking it into ``combine_sequences`` would pass no arguments.
+            # NOTE(review): returned without copying when no names are set.
+            _combined_ranges = self._ranges
+        elif len(self._ranges) == 0:
+            # Nothing to combine; ``combine_sequences`` needs at least one input.
+            _combined_ranges = GenomicRanges.empty()
+        else:
+            _combined_ranges = ut.combine_sequences(*self._ranges)
+
+        if self._names is None:
+            return _combined_ranges
+
+        # Repeat each element's name once per range that element contributes.
+        _combined_names = []
+        for _name, _length in zip(self._names, self._range_lengths):
+            _combined_names.extend([_name] * _length)
+
+        return _combined_ranges.set_names(_combined_names)
+
+    def to_granges(self) -> GenomicRanges:
+        """Short alias of :py:meth:`~to_genomic_ranges`; returns whatever that method returns."""
+        coerced = self.to_genomic_ranges()
+        return coerced
+
 
 @ut.combine_sequences.register(GenomicRangesList)
 def _combine_grl(*x: GenomicRangesList):

diff --git a/src/genomicranges/utils.py b/src/genomicranges/utils.py
@@ -1,3 +1,4 @@
+from itertools import groupby
 from typing import List, Optional, Sequence, Tuple, Union
 
 import biocutils as ut
@@ -206,3 +207,12 @@ def create_np_vector(
             _ = [revmap[x].append(idx + 1) for x in range(i[0] - 1, i[1])]
 
     return cov, revmap
+
+
+def group_by_indices(groups: list) -> dict:
+    """Group element positions by their label.
+
+    Args:
+        groups:
+            Sequence of hashable labels, one per element.
+
+    Returns:
+        A dictionary mapping each distinct label to the list of 0-based
+        positions (in ascending order) at which it occurs in ``groups``.
+        Keys are in sorted order when the labels can be compared with each
+        other, otherwise in order of first appearance.
+    """
+    positions = {}
+    for idx, label in enumerate(groups):
+        positions.setdefault(label, []).append(idx)
+
+    try:
+        # Only the distinct labels are sorted, not every (index, label) pair.
+        ordered = sorted(positions)
+    except TypeError:
+        # Labels of unorderable/mixed types (e.g. ``None`` next to strings)
+        # cannot be sorted; keep first-appearance order instead of failing.
+        return positions
+
+    return {label: positions[label] for label in ordered}
diff --git a/tests/test_gr_to_grl.py b/tests/test_gr_to_grl.py
@@ -0,0 +1,85 @@
+import pytest
+import pandas as pd
+import numpy as np
+from genomicranges import GenomicRanges, GenomicRangesList
+from biocframe import BiocFrame
+from iranges import IRanges
+from random import random
+import genomicranges
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+# Shared fixture: ten ranges over three chromosomes (3x chr1, 3x chr2,
+# 4x chr3) with an integer ``score`` column and a random ``GC`` column.
+subject = GenomicRanges(
+    seqnames=["chr1"] + ["chr2"] * 3 + ["chr1"] * 2 + ["chr3"] * 4,
+    ranges=IRanges(range(101, 111), range(121, 131)),
+    strand=list("*--**+++--"),
+    mcols=BiocFrame(
+        {
+            "score": range(10),
+            "GC": [random() for _ in range(10)],
+        }
+    ),
+)
+
+
+def test_split():
+    """Splitting by chromosome label yields one group per distinct label."""
+    assert subject is not None
+
+    # One label per range in ``subject``: 3x chr1, 3x chr2, 4x chr3.
+    groups = [
+        "chr1",
+        "chr2",
+        "chr2",
+        "chr2",
+        "chr1",
+        "chr1",
+        "chr3",
+        "chr3",
+        "chr3",
+        "chr3",
+    ]
+    splits = subject.split(groups)
+
+    assert splits is not None
+    assert isinstance(splits, GenomicRangesList)
+    assert len(splits) == 3
+
+    # Check the individual group sizes, not only that the total is preserved.
+    range_lengths = splits.get_range_lengths()
+    assert sorted(range_lengths) == [3, 3, 4]
+    assert sum(range_lengths) == len(subject)
+
+
+def test_to_granges():
+    # One label per range in ``subject``: 3x chr1, 3x chr2, 4x chr3.
+    groups = ["chr1"] + ["chr2"] * 3 + ["chr1"] * 2 + ["chr3"] * 4
+
+    assert subject is not None
+
+    # Split by label, then coerce the list back to a single object.
+    splits = subject.split(groups)
+    roundtrip = splits.to_genomic_ranges()
+
+    assert roundtrip is not None
+    assert isinstance(roundtrip, GenomicRanges)
+    assert len(subject) == len(roundtrip)