Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support to read genomic ranges objects #32

Merged
merged 3 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ install_requires =
scipy
singlecellexperiment>=0.4.1
summarizedexperiment>=0.4.1
genomicranges>=0.4.9

[options.packages.find]
where = src
Expand Down
111 changes: 111 additions & 0 deletions src/rds2py/granges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from genomicranges import GenomicRanges, SeqInfo
from iranges import IRanges
from biocframe import BiocFrame

from .parser import get_class
from .pdf import as_pandas_from_dframe

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


def as_granges(robj):
"""Parse an R object as a :py:class:`~genomicranges.GenomicRanges.GenomicRanges`.

Args:
robj:
Object parsed from the `RDS` file.

Usually the result of :py:func:`~rds2py.parser.read_rds`.

Returns:
A ``GenomicRanges`` object.
"""
_cls = get_class(robj)

if _cls not in ["GenomicRanges", "GRanges"]:
raise TypeError(f"obj is not genomic ranges, but is `{_cls}`.")

_range_start = robj["attributes"]["ranges"]["attributes"]["start"]["data"]
_range_width = robj["attributes"]["ranges"]["attributes"]["width"]["data"]
_range_names = None
if "NAMES" in robj["attributes"]["ranges"]["attributes"]:
_range_names = robj["attributes"]["ranges"]["attributes"]["NAMES"]["data"]
_ranges = IRanges(_range_start, _range_width, names=_range_names)

_seqnames = _as_list(robj["attributes"]["seqnames"])

_strand_obj = robj["attributes"]["strand"]["attributes"]["values"]
_strands = _strand_obj["data"]
if "attributes" in _strands:
if "levels" in _strands["attributes"]:
_levels_data = _strands["attributes"]["levels"]["data"]
_strands = [_levels_data[x] for x in _strands]

_seqinfo_seqnames = robj["attributes"]["seqinfo"]["attributes"]["seqnames"]["data"]
_seqinfo_seqlengths = robj["attributes"]["seqinfo"]["attributes"]["seqlengths"][
"data"
]
_seqinfo_is_circular = robj["attributes"]["seqinfo"]["attributes"]["is_circular"][
"data"
]
_seqinfo_genome = robj["attributes"]["seqinfo"]["attributes"]["genome"]["data"]
_seqinfo = SeqInfo(
seqnames=_seqinfo_seqnames,
seqlengths=[None if x == -2147483648 else x for x in _seqinfo_seqlengths],
is_circular=[None if x == -2147483648 else x for x in _seqinfo_is_circular],
genome=_seqinfo_genome,
)

_mcols = BiocFrame.from_pandas(
as_pandas_from_dframe(robj["attributes"]["elementMetadata"])
)

_gr_names = None
if "NAMES" in robj["attributes"]:
_gr_names = robj["attributes"]["NAMES"]["data"]

return GenomicRanges(
seqnames=_seqnames,
ranges=_ranges,
names=_gr_names,
mcols=_mcols,
seqinfo=_seqinfo,
)


def _as_list(robj):
"""Parse an R object as a :py:class:`~list`.

Args:
robj:
Object parsed from the `RDS` file.

Usually the result of :py:func:`~rds2py.parser.read_rds`.

Returns:
A ``list`` of the Rle class.
"""
_cls = get_class(robj)

if _cls not in ["Rle"]:
raise TypeError(f"obj is not Rle, but is `{_cls}`.")

_attr_vals = robj["attributes"]
_data = _attr_vals["values"]["data"].tolist()
if "attributes" in _attr_vals["values"]:
if "levels" in _attr_vals["values"]["attributes"]:
_levels_data = _attr_vals["values"]["attributes"]["levels"]["data"]
_data = [_levels_data[x - 1] for x in _data]

if "lengths" in _attr_vals:
_final = []
_lengths = _attr_vals["lengths"]["data"]

for idx, lg in enumerate(_lengths.tolist()):
_final.extend([_data[idx]] * lg)

_data = _final

return _data
11 changes: 11 additions & 0 deletions tests/data/generate_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,14 @@ saveRDS(y, file="s4_matrix.rds")
setClass("FOO", slots=c(bar="integer"))
y <- new("FOO", bar=2L)
saveRDS(y, file="s4_class.rds")

# GenomicRanges

gr <- GRanges(
seqnames = Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)),
ranges = IRanges(101:110, end = 111:120, names = head(letters, 10)),
strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
score = 1:10,
GC = seq(1, 0, length=10))

saveRDS(gr, file="granges.rds")
Binary file added tests/data/granges.rds
Binary file not shown.
18 changes: 18 additions & 0 deletions tests/test_granges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pytest

from rds2py.granges import as_granges
from rds2py.parser import read_rds

from genomicranges import GenomicRanges

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


def test_granges():
robj = read_rds("tests/data/granges.rds")

gr = as_granges(robj=robj)

assert isinstance(gr, GenomicRanges)
Loading