Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse RDS containing GenomicRangesList #44

Merged
merged 6 commits into from
Jul 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## Version 0.4.4

- Add methods to parse RDS files containing `GenomicRangesList`
- Fix a bug in reading strand information, which mostly affected RLE-encoded vectors.
- Update tests and documentation

## Version 0.4.0 - 0.4.3

- Migrate to the new class implementations
Expand Down
52 changes: 45 additions & 7 deletions src/rds2py/granges.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from genomicranges import GenomicRanges, SeqInfo
from iranges import IRanges
from biocframe import BiocFrame
from genomicranges import GenomicRanges, GenomicRangesList, SeqInfo
from iranges import IRanges

from .parser import get_class
from .pdf import as_pandas_from_dframe
Expand Down Expand Up @@ -36,12 +36,16 @@ def as_granges(robj):

_seqnames = _as_list(robj["attributes"]["seqnames"])

_strand_obj = robj["attributes"]["strand"]["attributes"]["values"]
_strands = _strand_obj["data"]
_strands = robj["attributes"]["strand"]
_fstrand = None
if "attributes" in _strands:
if "levels" in _strands["attributes"]:
_levels_data = _strands["attributes"]["levels"]["data"]
_strands = [_levels_data[x] for x in _strands]
_lengths = _strands["attributes"]["lengths"]["data"]
_factors = _strands["attributes"]["values"]["data"]
_levels = _strands["attributes"]["values"]["attributes"]["levels"]["data"]
_strds = [_levels[x - 1] for x in _factors]
_fstrand = []
for i, x in enumerate(_lengths):
_fstrand.extend([_strds[i]] * x)

_seqinfo_seqnames = robj["attributes"]["seqinfo"]["attributes"]["seqnames"]["data"]
_seqinfo_seqlengths = robj["attributes"]["seqinfo"]["attributes"]["seqlengths"][
Expand Down Expand Up @@ -71,6 +75,7 @@ def as_granges(robj):
return GenomicRanges(
seqnames=_seqnames,
ranges=_ranges,
strand=_fstrand,
names=_gr_names,
mcols=_mcols,
seqinfo=_seqinfo,
Expand Down Expand Up @@ -111,3 +116,36 @@ def _as_list(robj):
_data = _final

return _data


def as_granges_list(robj):
    """Convert a parsed R object into a ``GenomicRangesList``.

    The result is a
    :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList` built by
    parsing the concatenated ranges once and then slicing them into groups.

    Args:
        robj:
            Object parsed from the `RDS` file, indexed by string keys.

            Usually the result of :py:func:`~rds2py.parser.read_rds`.

    Returns:
        A ``GenomicRangesList`` holding one ``GenomicRanges`` slice per
        partition end, named by the partitioning's ``NAMES`` entry.

    Raises:
        TypeError: If the class reported for ``robj`` is neither
            ``CompressedGRangesList`` nor ``GRangesList``.
    """
    r_class = get_class(robj)
    if r_class not in ("CompressedGRangesList", "GRangesList"):
        raise TypeError(f"obj is not genomic ranges list, but is `{r_class}`.")

    attrs = robj["attributes"]

    # Every range lives in one flat object; the partitioning records where
    # each group stops within it.
    flat = as_granges(attrs["unlistData"])

    partitioning = attrs["partitioning"]["attributes"]
    group_names = partitioning["NAMES"]["data"]
    ends = partitioning["end"]["data"]

    # Each end doubles as the start of the next group, so consecutive pairs
    # of (0, end_1, end_2, ...) give the slice bounds for every group.
    bounds = [0, *ends]
    groups = [flat[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

    return GenomicRangesList(ranges=groups, names=group_names)
Binary file added tests/data/grangeslist.rds
Binary file not shown.
13 changes: 11 additions & 2 deletions tests/test_granges.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pytest

from rds2py.granges import as_granges
from rds2py.granges import as_granges, as_granges_list
from rds2py.parser import read_rds

from genomicranges import GenomicRanges
from genomicranges import GenomicRanges, GenomicRangesList

__author__ = "jkanche"
__copyright__ = "jkanche"
Expand All @@ -16,3 +16,12 @@ def test_granges():
gr = as_granges(robj=robj)

assert isinstance(gr, GenomicRanges)


def test_granges_list():
    """Check that the ``grangeslist.rds`` fixture parses into a five-element ``GenomicRangesList``."""
    parsed = read_rds("tests/data/grangeslist.rds")
    grl = as_granges_list(robj=parsed)

    assert isinstance(grl, GenomicRangesList)
    assert len(grl) == 5
Loading