def as_granges_list(robj):
    """Parse an R object as a :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList`.

    The flattened ranges stored in the ``unlistData`` attribute are parsed
    once with :py:func:`as_granges` and then sliced into consecutive groups
    using the end positions stored in the ``partitioning`` attribute.

    Args:
        robj:
            Object parsed from the `RDS` file.

            Usually the result of :py:func:`~rds2py.parser.read_rds`.

    Raises:
        TypeError:
            If the class of ``robj`` is neither ``CompressedGRangesList``
            nor ``GRangesList``.

    Returns:
        A ``GenomicRangesList`` object.
    """
    _cls = get_class(robj)

    if _cls not in ("CompressedGRangesList", "GRangesList"):
        raise TypeError(f"obj is not genomic ranges list, but is `{_cls}`.")

    _attrs = robj["attributes"]

    # Parse the concatenated ranges once; each group is a slice of it.
    _gre = as_granges(_attrs["unlistData"])

    _partitioning = _attrs["partitioning"]["attributes"]
    _partitionends = _partitioning["end"]["data"]

    # Group names are optional: an unnamed list should not fail the parse.
    # NOTE(review): presumably a missing/NULL `NAMES` slot shows up either as
    # an absent key or as an entry without usable "data" -- confirm against
    # the parser output.
    _groups = None
    if "NAMES" in _partitioning:
        _names_obj = _partitioning["NAMES"]
        if isinstance(_names_obj, dict):
            _groups = _names_obj.get("data")

    # Each partition end is used as the exclusive stop of one slice and as
    # the start of the next.
    _grelist = []
    _start = 0
    for _pend in _partitionends:
        _grelist.append(_gre[_start:_pend])
        _start = _pend

    return GenomicRangesList(ranges=_grelist, names=_groups)