diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index 2717d14..1381761 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -29,7 +29,7 @@ jobs: - name: Build docs run: | python setup.py build_ext --inplace - cp build/lib*/rds2py/rds_parser* src/rds2py/ + cp build/lib*/rds2py/lib_rds_parser* src/rds2py/ tox -e docs touch ./docs/_build/html/.nojekyll diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml index bdbb89c..abc9222 100644 --- a/.github/workflows/publish-pypi.yml +++ b/.github/workflows/publish-pypi.yml @@ -28,7 +28,7 @@ jobs: build_macosx_x86_64: name: Build wheels for macosx x86_64 - runs-on: macos-11 + runs-on: macos-13 steps: - name: Check out repository uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index e9e1e9b..6b070ba 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,6 @@ MANIFEST .venv*/ .conda*/ .python-version + +extern/rds2cpp* +src/rds2py/lib/parser.cpp diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 96cab27..37f69d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,11 +25,11 @@ repos: args: [--in-place, --wrap-descriptions=120, --wrap-summaries=120] # --config, ./pyproject.toml -- repo: https://github.com/psf/black - rev: 24.8.0 - hooks: - - id: black - language_version: python3 +# - repo: https://github.com/psf/black +# rev: 24.8.0 +# hooks: +# - id: black +# language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. @@ -37,6 +37,8 @@ repos: hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] + # Run the formatter. + - id: ruff-format ## If like to embrace black styles even in the docs: # - repo: https://github.com/asottile/blacken-docs diff --git a/AUTHORS.md b/AUTHORS.md index f635b91..f21a024 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -1,3 +1,3 @@ # Contributors -* jkanche [jayaram.kancherla@gmail.com](mailto:jayaram.kancherla@gmail.com) +* Jayaram Kancherla [jayaram.kancherla@gmail.com](mailto:jayaram.kancherla@gmail.com) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac57df2..7790bf9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,17 @@ # Changelog -## Development +## Version 0.5.0 -- Fix github issue with showing incorrect package version on github pages. +- Complete overhaul of the codebase using pybind11 +- Streamlined readers for R data types +- Updated API for all classes and methods +- Updated documentation and tests. ## Version 0.4.5 - Switch to pybind11 to implementing the bindings to rds2cpp. - Update tests, documentation and actions. +- Fix github issue with showing incorrect package version on github pages. ## Version 0.4.4 diff --git a/README.md b/README.md index f0ca25d..968f315 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,39 @@ # rds2py -Parse and construct Python representations for datasets stored in RDS files. `rds2py` supports a few base classes from R and Bioconductor's `SummarizedExperiment` and `SingleCellExperiment` S4 classes. **_This is possible because of [Aaron's rds2cpp library](https://github.com/LTLA/rds2cpp)._** - -The package uses memory views (except for strings) to access the same memory from C++ in Python (through Cython of course). This is especially useful for large datasets so we don't make multiple copies of data. - -## Install +Parse and construct Python representations for datasets stored in RDS files. 
`rds2py` supports various base classes from R, and Bioconductor's `SummarizedExperiment` and `SingleCellExperiment` S4 classes. ***For more details, check out [rds2cpp library](https://github.com/LTLA/rds2cpp).*** + +> **Important Version Notice** +> +> Version 0.5.0 brings major changes to the package: +> - Complete overhaul of the codebase using pybind11 +> - Streamlined readers for R data types +> - Updated API for all classes and methods +> +> Please refer to the [documentation](https://biocpy.github.io/rds2py/) for the latest usage guidelines. Previous versions may have incompatible APIs. + +The package provides: + +- Efficient parsing of RDS files with *minimal* memory overhead +- Support for R's basic data types and complex S4 objects + - Vectors (numeric, character, logical) + - Factors + - Data frames + - Matrices (dense and sparse) + - Run-length encoded vectors (Rle) +- Conversion to appropriate Python/NumPy/SciPy data structures + - dgCMatrix (sparse column matrix) + - dgRMatrix (sparse row matrix) + - dgTMatrix (sparse triplet matrix) +- Preservation of metadata and attributes from R objects +- Integration with BiocPy ecosystem for Bioconductor classes + - SummarizedExperiment + - RangedSummarizedExperiment + - SingleCellExperiment + - GenomicRanges + - MultiAssayExperiment + +## Installation Package is published to [PyPI](https://pypi.org/project/rds2py/) @@ -16,57 +44,64 @@ Package is published to [PyPI](https://pypi.org/project/rds2py/) pip install rds2py ``` -## Usage - -If you do not have an RDS object handy, feel free to download one from [single-cell-test-files](https://github.com/jkanche/random-test-files/releases). +## Quick Start ```python -from rds2py import as_summarized_experiment, read_rds +from rds2py import read_rds -r_obj = read_rds() +# Read any RDS file +r_obj = read_rds("path/to/file.rds") ``` -This `r_obj` holds a dictionary representation of the RDS file, we can now transform this object into Python representations. - -`rObj` always contains two keys +## Usage -- `data`: If atomic entities, contains the NumPy view of the array. -- `attributes`: Additional properties available for the object. +If you do not have an RDS object handy, feel free to download one from [single-cell-test-files](https://github.com/jkanche/random-test-files/releases). -In addition, the package provides functions to convert parsed R objects into Python representations. +### Basic Usage ```python -from rds2py import as_spase_matrix, as_summarized_experiment - -# to convert an robject to a sparse matrix -sp_mat = as_sparse(rObj) - -# to convert an robject to SCE -sce = as_summarized_experiment(rObj) +from rds2py import read_rds +r_obj = read_rds("path/to/file.rds") ``` -For more examples converting `data.frame`, `dgCMatrix`, `dgRMatrix`, `dgTMatrix` to Python, checkout the [documentation](https://biocpy.github.io/rds2py/). +The returned `r_obj` either returns an appropriate Python class if a parser is already implemented or returns the dictionary containing the data from the RDS file. -## Developer Notes +## Write-your-own-reader -This project uses Cython to provide bindings from C++ to Python. +In addition, the package provides the dictionary representation of the RDS file, allowing users to write their own custom readers into appropriate Python representations. 
-Steps to setup dependencies - +```python +from rds2py import parse_rds -- git submodules is initialized in `extern/rds2cpp` -- `cmake .` in `extern/rds2cpp` directory to download dependencies, especially the `byteme` library +data = parse_rds("path/to/file.rds") +print(data) +``` -First one needs to build the extern library, this would generate a shared object file to `src/rds2py/core-[*].so` +if you know this RDS file contains an `GenomicRanges` object, you can use or modify the provided list reader, or write your own parser to convert this dictionary. -```shell -python setup.py build_ext --inplace +```python +from rds2py.read_granges import read_genomic_ranges + +gr = read_genomic_ranges(data) ``` -For typical development workflows, run +## Type Conversion Reference -```shell -python setup.py build_ext --inplace && tox -``` +| R Type | Python/NumPy Type | +|--------|------------------| +| numeric | numpy.ndarray (float64) | +| integer | numpy.ndarray (int32) | +| character | list of str | +| logical | numpy.ndarray (bool) | +| factor | list | +| data.frame | BiocFrame | +| matrix | numpy.ndarray or scipy.sparse matrix | +| dgCMatrix | scipy.sparse.csc_matrix | +| dgRMatrix | scipy.sparse.csr_matrix | + +## Developer Notes + +This project uses pybind11 to provide bindings to the rds2cpp library. Please make sure necessary C++ compiler is installed on your system. diff --git a/docs/tutorial.md b/docs/tutorial.md index fdcad60..dbabf6d 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -2,64 +2,40 @@ If you do not have an RDS object handy, feel free to download one from [single-cell-test-files](https://github.com/jkanche/random-test-files/releases). -## Step 1: Read a RDS file in Python - -First we need to read the RDS file that can be easily explored in Python. The `read_rds` parses the R object and returns -a dictionary of the R object. +### Basic Usage ```python from rds2py import read_rds - -rObj = read_rds() +r_obj = read_rds("path/to/file.rds") ``` -Once we have a realized structure, we can now convert this object to useful Python representations. It contains two keys - -- `data`: If atomic entities, contains the numpy view of the memory space. -- `attributes`: Additional properties available for the object. - -The package provides friendly functions to convert some R representations to useful Python representations. - -## Step 2: Python representations +The returned `r_obj` either returns an appropriate Python class if a parser is already implemented or returns the dictionary containing the data from the RDS file. -### Matrices +## Write-your-own-reader -Use these methods if the RDS file contains either a sparse matrix (`dgCMatrix`, `dgRMatrix`, or `dgTMatrix`) or a dense matrix. - -**_Note: If an R object contains `dims` in the `attributes`, we consider this as a matrix._** +In addition, the package provides the dictionary representation of the RDS file, allowing users to write their own custom readers into appropriate Python representations. ```python -from rds2py import as_spase_matrix, as_dense_matrix - -# to convert an robject to a sparse matrix -sp_mat = as_sparse_matrix(rObj) - -# to convert an robject to a sparse matrix -dense_mat = as_dense_matrix(rObj) -``` - -### Pandas DataFrame +from rds2py import parse_rds -Methods are available to construct a pandas `DataFrame` from data stored in an RDS file. The package supports two R classes for this operation - `data.frame` and `DFrame` classes. 
- -```python -from rds2py import as_pandas - -# to convert an robject to DF -df = as_pandas(rObj) -``` - -### S4 classes: specifically `SingleCellExperiment` or `SummarizedExperiment` - -We also support `SingleCellExperiment` or `SummarizedExperiment` from Bioconductor. the `as_summarized_experiment` method is how we one can do this operation. - -**_Note: This method also serves as an example on how to convert complex R structures into Python representations._** - -```python -from rds2py import as_summarized_experiment +data = parse_rds("path/to/file.rds") +print(data) -# to convert an robject to SCE -sp_mat = as_summarized_experiment(rObj) +# now write your own parser to convert this dictionary. ``` -Well thats it, hack on & create more base representations to encapsulate complex structures. If you want to add more representations, feel free to send a PR! +## Type Conversion Reference + +| R Type | Python/NumPy Type | +|--------|------------------| +| numeric | numpy.ndarray (float64) | +| integer | numpy.ndarray (int32) | +| character | list of str | +| logical | numpy.ndarray (bool) | +| factor | list | +| data.frame | BiocFrame | +| matrix | numpy.ndarray or scipy.sparse matrix | +| dgCMatrix | scipy.sparse.csc_matrix | +| dgRMatrix | scipy.sparse.csr_matrix | + +Check out the module reference for more information on these classes. diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index c38c3b8..c8fc51b 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -31,6 +31,6 @@ set_property(TARGET ${TARGET} PROPERTY CXX_STANDARD 17) target_link_libraries(${TARGET} PRIVATE rds2cpp pybind11::pybind11) set_target_properties(${TARGET} PROPERTIES - OUTPUT_NAME rds_parser + OUTPUT_NAME lib_rds_parser PREFIX "" ) diff --git a/lib/src/rdswrapper.cpp b/lib/src/rdswrapper.cpp index 2c9dcfa..1b52da2 100644 --- a/lib/src/rdswrapper.cpp +++ b/lib/src/rdswrapper.cpp @@ -20,12 +20,12 @@ class RdsReader { if (!ptr) throw std::runtime_error("Null pointer in 'get_rtype'."); // py::print("arg::", static_cast(ptr->type())); switch (ptr->type()) { + case rds2cpp::SEXPType::S4: return "S4"; case rds2cpp::SEXPType::INT: return "integer"; case rds2cpp::SEXPType::REAL: return "double"; case rds2cpp::SEXPType::STR: return "string"; case rds2cpp::SEXPType::LGL: return "boolean"; case rds2cpp::SEXPType::VEC: return "vector"; - case rds2cpp::SEXPType::S4: return "S4"; case rds2cpp::SEXPType::NIL: return "null"; default: return "other"; } @@ -164,7 +164,7 @@ class RdsObject { } }; -PYBIND11_MODULE(rds_parser, m) { +PYBIND11_MODULE(lib_rds_parser, m) { py::register_exception(m, "RdsParserError"); py::class_(m, "RdsObject") diff --git a/setup.cfg b/setup.cfg index 623f78c..09acc5f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,7 +5,7 @@ [metadata] name = rds2py -description = Parse and read RDS files as Python representations +description = Parse and construct Python representations for datasets stored in RDS files author = jkanche author_email = jayaram.kancherla@gmail.com license = MIT @@ -50,11 +50,13 @@ python_requires = >=3.8 install_requires = importlib-metadata; python_version<"3.8" numpy - pandas scipy + biocutils>=0.1.5 singlecellexperiment>=0.4.1 summarizedexperiment>=0.4.1 genomicranges>=0.4.9 + biocframe + multiassayexperiment [options.packages.find] where = src @@ -65,17 +67,14 @@ exclude = # Add here additional requirements for extra features, to install with: # `pip install rds2py[PDF]` like: # PDF = ReportLab; RXP +optional = + pandas # Add here test requirements (semicolon/line-separated) testing = 
setuptools pytest pytest-cov - numpy - pandas - scipy - singlecellexperiment - summarizedexperiment [options.entry_points] # Add here console scripts like: diff --git a/setup.py b/setup.py index 8649f6a..f495462 100644 --- a/setup.py +++ b/setup.py @@ -38,10 +38,7 @@ def build_cmake(self, ext): "lib", "-B", build_temp, - "-Dpybind11_DIR=" - + os.path.join( - os.path.dirname(pybind11.__file__), "share", "cmake", "pybind11" - ), + "-Dpybind11_DIR=" + os.path.join(os.path.dirname(pybind11.__file__), "share", "cmake", "pybind11"), "-DPYTHON_EXECUTABLE=" + sys.executable, ] if os.name != "nt": diff --git a/src/rds2py/PyRdsReader.py b/src/rds2py/PyRdsReader.py index 5219d31..166719b 100644 --- a/src/rds2py/PyRdsReader.py +++ b/src/rds2py/PyRdsReader.py @@ -1,31 +1,75 @@ -from .rds_parser import RdsObject, RdsReader -import numpy as np -from typing import Dict, Any, List, Union +"""Low-level interface for reading RDS file format. + +This module provides the core functionality for parsing RDS files at a binary level and converting them into a +dictionary representation that can be further processed by higher-level functions. +""" + +from typing import Any, Dict, List, Union from warnings import warn +import numpy as np + +from .lib_rds_parser import RdsObject, RdsReader + class PyRdsParserError(Exception): + """Exception raised for errors during RDS parsing.""" + pass class PyRdsParser: - """Python bindings to the rds2cpp interface.""" + """Parser for reading RDS files. + + This class provides low-level access to RDS file contents, handling the binary + format and converting it into Python data structures. It supports various R + data types and handles special R cases like NA values, integer sequences and + range functions. + + Attributes: + R_MIN: + Minimum integer value in R, used for handling NA values. + + rds_object: + Internal representation of the RDS file. + + root_object: + Root object of the parsed RDS file. + """ + + R_MIN: int = -2147483648 def __init__(self, file_path: str): + """Initialize the class. + + Args: + file_path: + Path to the RDS file to be read. + """ try: self.rds_object = RdsObject(file_path) robject = self.rds_object.get_robject() + if not isinstance(robject, RdsReader): raise TypeError(f"Expected 'RdsReader' object, got {type(robject)}") + self.root_object = robject except Exception as e: raise PyRdsParserError(f"Error initializing 'PyRdsParser': {str(e)}") def parse(self) -> Dict[str, Any]: - """Parse the RDS File (recursively). + """Parse the entire RDS file into a dictionary structure. Returns: - A Dictionary with object attributes as keys and the value representing the data from the RDS file. + A dictionary containing the parsed data with keys: + - 'type': The R object type + - 'data': The actual data (if applicable) + - 'attributes': R object attributes (if any) + - 'class_name': The R class name + - Additional keys depending on the object type + + Raises: + PyRdsParserError: If there's an error during parsing. 
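+
+        Example:
+            A minimal usage sketch; ``"example.rds"`` is a placeholder path:
+
+            >>> parser = PyRdsParser("example.rds")
+            >>> robj = parser.parse()
+            >>> rtype = robj["type"]  # R type of the root object, e.g. "integer"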
""" try: return self._process_object(self.root_object) @@ -37,44 +81,66 @@ def _process_object(self, obj: RdsReader) -> Dict[str, Any]: rtype = obj.get_rtype() result: Dict[str, Any] = {"type": rtype} - if rtype in ["integer", "boolean", "double"]: - result["data"] = self._get_numeric_data(obj, rtype) + if rtype == "S4": + result["package_name"] = obj.get_package_name() + result["class_name"] = obj.get_class_name() + result["attributes"] = self._process_attributes(obj) + elif rtype in ["integer", "boolean", "double"]: + result["data"] = self._handle_r_special_cases( + self._get_numeric_data(obj, rtype), rtype, obj.get_rsize() + ) result["attributes"] = self._process_attributes(obj) + result["class_name"] = f"{rtype}_vector" elif rtype == "string": result["data"] = obj.get_string_arr() + result["class_name"] = "string_vector" elif rtype == "vector": result["data"] = self._process_vector(obj) result["attributes"] = self._process_attributes(obj) - elif rtype == "S4": - result["package_name"] = obj.get_package_name() - result["class_name"] = obj.get_class_name() - result["attributes"] = self._process_attributes(obj) + result["class_name"] = "vector" elif rtype == "null": pass else: # raise ValueError - warn(f"Unsupported R object type: {rtype}") + warn(f"Unsupported R object type: {rtype}", RuntimeWarning) result["data"] = None result["attributes"] = None + result["class_name"] = None return result except Exception as e: raise PyRdsParserError(f"Error processing object: {str(e)}") + def _handle_r_special_cases(self, data: np.ndarray, rtype: str, size: int) -> Union[np.ndarray, range]: + """Handle special R data representations.""" + try: + # Special handling for R integer containing NA + if size != 2: + if any(data == self.R_MIN): + return np.array([np.nan if x == self.R_MIN else x for x in data]) + + # Special handling for R integer sequences + if rtype == "integer" and size == 2 and data[0] == self.R_MIN and data[1] < 0: + if data[1] == self.R_MIN: + return [None, None] + return range(data[1] * -1) + + return data + except Exception as e: + raise PyRdsParserError(f"Error handling R special cases: {str(e)}") + def _get_numeric_data(self, obj: RdsReader, rtype: str) -> np.ndarray: try: data = obj.get_numeric_data() if rtype == "boolean": return data.astype(bool) + return data except Exception as e: raise PyRdsParserError(f"Error getting numeric data: {str(e)}") def _process_vector(self, obj: RdsReader) -> List[Dict[str, Any]]: - return [ - self._process_object(obj.load_vec_element(i)) - for i in range(obj.get_rsize()) - ] + return [self._process_object(obj.load_vec_element(i)) for i in range(obj.get_rsize())] def _process_attributes(self, obj: RdsReader) -> Dict[str, Dict[str, Any]]: try: @@ -82,6 +148,7 @@ def _process_attributes(self, obj: RdsReader) -> Dict[str, Dict[str, Any]]: for name in obj.get_attribute_names(): attr_obj = obj.load_attribute_by_name(name) attributes[name] = self._process_object(attr_obj) + return attributes except Exception as e: raise PyRdsParserError(f"Error processing attributes: {str(e)}") diff --git a/src/rds2py/__init__.py b/src/rds2py/__init__.py index d16b158..f64e9e8 100644 --- a/src/rds2py/__init__.py +++ b/src/rds2py/__init__.py @@ -15,13 +15,5 @@ finally: del version, PackageNotFoundError -# from .core import * - -from .interface import ( - as_dense_matrix, - as_sparse_matrix, - as_pandas, - as_summarized_experiment, -) - -from .parser import read_rds, get_class +from .generics import read_rds +from .rdsutils import parse_rds diff --git 
a/src/rds2py/generics.py b/src/rds2py/generics.py new file mode 100644 index 0000000..565d1cd --- /dev/null +++ b/src/rds2py/generics.py @@ -0,0 +1,140 @@ +"""Core functionality for reading RDS files in Python. + +This module provides the main interface for reading RDS files and converting them +to appropriate Python objects. It maintains a registry of supported R object types +and their corresponding Python parser functions. + +The module supports various R object types including vectors, matrices, data frames, +and specialized Bioconductor objects like GenomicRanges and SummarizedExperiment. + +Example: + + .. code-block:: python + + data = read_rds("example.rds") + print(type(data)) +""" + +from importlib import import_module +from warnings import warn + +from .rdsutils import get_class, parse_rds + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + +REGISTRY = { + # typed vectors + "integer_vector": "rds2py.read_atomic.read_integer_vector", + "boolean_vector": "rds2py.read_atomic.read_boolean_vector", + "string_vector": "rds2py.read_atomic.read_string_vector", + "double_vector": "rds2py.read_atomic.read_double_vector", + # dictionary + "vector": "rds2py.read_dict.read_dict", + # factors + "factor": "rds2py.read_factor.read_factor", + # Rle + "Rle": "rds2py.read_rle.read_rle", + # matrices + "dgCMatrix": "rds2py.read_matrix.read_dgcmatrix", + "dgRMatrix": "rds2py.read_matrix.read_dgrmatrix", + "dgTMatrix": "rds2py.read_matrix.read_dgtmatrix", + "ndarray": "rds2py.read_matrix.read_ndarray", + # data frames + "data.frame": "rds2py.read_frame.read_data_frame", + "DFrame": "rds2py.read_frame.read_dframe", + # genomic ranges + "GRanges": "rds2py.read_granges.read_genomic_ranges", + "GenomicRanges": "rds2py.read_granges.read_genomic_ranges", + "CompressedGRangesList": "rds2py.read_granges.read_granges_list", + "GRangesList": "rds2py.read_granges.read_granges_list", + # summarized experiment + "SummarizedExperiment": "rds2py.read_se.read_summarized_experiment", + "RangedSummarizedExperiment": "rds2py.read_se.read_ranged_summarized_experiment", + # single-cell experiment + "SingleCellExperiment": "rds2py.read_sce.read_single_cell_experiment", + "SummarizedExperimentByColumn": "rds2py.read_sce.read_alts_summarized_experiment_by_column", + # multi assay experiment + "MultiAssayExperiment": "rds2py.read_mae.read_multi_assay_experiment", + "ExperimentList": "rds2py.read_dict.read_dict", +} + + +# @singledispatch +# def save_rds(x, path: str): +# """Save a Python object as RDS file. + +# Args: +# x: +# Object to save. + +# path: +# Path to save the object. +# """ +# raise NotImplementedError( +# f"No `save_rds` method implemented for '{type(x).__name__}' objects." +# ) + + +def read_rds(path: str, **kwargs): + """Read an RDS file and convert it to an appropriate Python object. + + Args: + path: + Path to the RDS file to be read. + + **kwargs: + Additional arguments passed to specific parser functions. + + Returns: + A Python object representing the data in the RDS file. The exact type + depends on the contents of the RDS file and the available parsers. + """ + _robj = parse_rds(path=path) + return _dispatcher(_robj, **kwargs) + + +def _dispatcher(robject: dict, **kwargs): + """Internal function to dispatch R objects to appropriate parser functions. + + Args: + robject: + Dictionary containing parsed R object data. + + **kwargs: + Additional arguments passed to specific parser functions. + + Returns: + Parsed Python object corresponding to the R data structure. 
+ Returns the original dictionary if no appropriate parser is found. + """ + _class_name = get_class(robject) + + if _class_name is None: + return None + + # if a class is registered, coerce the object + # to the representation. + if _class_name in REGISTRY: + try: + command = REGISTRY[_class_name] + if isinstance(command, str): + last_period = command.rfind(".") + mod = import_module(command[:last_period]) + command = getattr(mod, command[last_period + 1 :]) + REGISTRY[_class_name] = command + + return command(robject, **kwargs) + except Exception as e: + warn( + f"Failed to coerce RDS object to class: '{_class_name}', returning the dictionary, {str(e)}", + RuntimeWarning, + ) + else: + warn( + f"RDS file contains an unknown class: '{_class_name}', returning the dictionary", + RuntimeWarning, + ) + + return robject diff --git a/src/rds2py/granges.py b/src/rds2py/granges.py deleted file mode 100644 index 7e7081b..0000000 --- a/src/rds2py/granges.py +++ /dev/null @@ -1,151 +0,0 @@ -from biocframe import BiocFrame -from genomicranges import GenomicRanges, GenomicRangesList, SeqInfo -from iranges import IRanges - -from .parser import get_class -from .pdf import as_pandas_from_dframe - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def as_granges(robj): - """Parse an R object as a :py:class:`~genomicranges.GenomicRanges.GenomicRanges`. - - Args: - robj: - Object parsed from the `RDS` file. - - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - Returns: - A ``GenomicRanges`` object. - """ - _cls = get_class(robj) - - if _cls not in ["GenomicRanges", "GRanges"]: - raise TypeError(f"obj is not genomic ranges, but is `{_cls}`.") - - _range_start = robj["attributes"]["ranges"]["attributes"]["start"]["data"] - _range_width = robj["attributes"]["ranges"]["attributes"]["width"]["data"] - _range_names = None - if "NAMES" in robj["attributes"]["ranges"]["attributes"]: - _range_names = robj["attributes"]["ranges"]["attributes"]["NAMES"]["data"] - _ranges = IRanges(_range_start, _range_width, names=_range_names) - - _seqnames = _as_list(robj["attributes"]["seqnames"]) - - _strands = robj["attributes"]["strand"] - _fstrand = None - if "attributes" in _strands: - _lengths = _strands["attributes"]["lengths"]["data"] - _factors = _strands["attributes"]["values"]["data"] - _levels = _strands["attributes"]["values"]["attributes"]["levels"]["data"] - _strds = [_levels[x - 1] for x in _factors] - _fstrand = [] - for i, x in enumerate(_lengths): - _fstrand.extend([_strds[i]] * x) - - _seqinfo_seqnames = robj["attributes"]["seqinfo"]["attributes"]["seqnames"]["data"] - _seqinfo_seqlengths = robj["attributes"]["seqinfo"]["attributes"]["seqlengths"][ - "data" - ] - _seqinfo_is_circular = robj["attributes"]["seqinfo"]["attributes"]["is_circular"][ - "data" - ] - _seqinfo_genome = robj["attributes"]["seqinfo"]["attributes"]["genome"]["data"] - _seqinfo = SeqInfo( - seqnames=_seqinfo_seqnames, - seqlengths=[None if x == -2147483648 else int(x) for x in _seqinfo_seqlengths], - is_circular=[ - None if x == -2147483648 else bool(x) for x in _seqinfo_is_circular - ], - genome=_seqinfo_genome, - ) - - _mcols = BiocFrame.from_pandas( - as_pandas_from_dframe(robj["attributes"]["elementMetadata"]) - ) - - _gr_names = None - if "NAMES" in robj["attributes"]: - _gr_names = robj["attributes"]["NAMES"]["data"] - - return GenomicRanges( - seqnames=_seqnames, - ranges=_ranges, - strand=_fstrand, - names=_gr_names, - mcols=_mcols, - seqinfo=_seqinfo, - ) - - -def _as_list(robj): - """Parse an R 
object as a :py:class:`~list`. - - Args: - robj: - Object parsed from the `RDS` file. - - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - Returns: - A ``list`` of the Rle class. - """ - _cls = get_class(robj) - - if _cls not in ["Rle"]: - raise TypeError(f"obj is not Rle, but is `{_cls}`.") - - _attr_vals = robj["attributes"] - _data = _attr_vals["values"]["data"].tolist() - if "attributes" in _attr_vals["values"]: - if "levels" in _attr_vals["values"]["attributes"]: - _levels_data = _attr_vals["values"]["attributes"]["levels"]["data"] - _data = [_levels_data[x - 1] for x in _data] - - if "lengths" in _attr_vals: - _final = [] - _lengths = _attr_vals["lengths"]["data"] - - for idx, lg in enumerate(_lengths.tolist()): - _final.extend([_data[idx]] * lg) - - _data = _final - - return _data - - -def as_granges_list(robj): - """Parse an R object as a :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList`. - - Args: - robj: - Object parsed from the `RDS` file. - - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - Returns: - A ``GenomicRangesList`` object. - """ - - _cls = get_class(robj) - - if _cls not in ["CompressedGRangesList", "GRangesList"]: - raise TypeError(f"obj is not genomic ranges list, but is `{_cls}`.") - - _gre = as_granges(robj["attributes"]["unlistData"]) - - _groups = robj["attributes"]["partitioning"]["attributes"]["NAMES"]["data"] - _partitionends = robj["attributes"]["partitioning"]["attributes"]["end"]["data"] - - _grelist = [] - - current = 0 - for _pend in _partitionends: - _grelist.append(_gre[current:_pend]) - current = _pend - - return GenomicRangesList(ranges=_grelist, names=_groups) diff --git a/src/rds2py/interface.py b/src/rds2py/interface.py deleted file mode 100644 index 8f8ad93..0000000 --- a/src/rds2py/interface.py +++ /dev/null @@ -1,258 +0,0 @@ -from typing import Literal - -from numpy import ndarray -from singlecellexperiment import SingleCellExperiment -from summarizedexperiment import SummarizedExperiment -from biocframe import BiocFrame - -from .parser import get_class -from .pdf import as_pandas_from_data_frame, as_pandas_from_dframe - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def as_pandas(robj): - """Parse an R object as a :py:class:`~pandas.DataFrame`. - - Currently supports ``DFrame`` or ``data.frame`` class objects from R. - - Args: - robj: - Object parsed from the `RDS` file. - - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - Returns: - A :py:class:`~pandas.DataFrame` containing the data from the R Object. - """ - _cls = get_class(robj) - - if _cls == "DFrame": - return as_pandas_from_dframe(robj) - elif _cls == "data.frame": - return as_pandas_from_data_frame(robj) - else: - raise TypeError( - f"`robj` must be either a 'DFrame' or 'data.frame' but is {_cls}." - ) - - -def as_sparse_matrix(robj): - """Parse an R object as a sparse matrix. - - Only supports reading of `dgCMatrix`, `dgRMatrix`, `dgTMatrix` marices. - - Args: - robj: - Object parsed from the `RDS` file. - - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - Returns: - A sparse matrix of the R object. - """ - from scipy.sparse import csc_matrix, csr_matrix - - _cls = get_class(robj) - - if _cls not in ["dgCMatrix", "dgRMatrix", "dgTMatrix"]: - raise TypeError( - f"`robj` does not contain not a supported sparse matrix format, contains `{_cls}`." 
- ) - - if _cls == "dgCMatrix": - return csc_matrix( - ( - robj["attributes"]["x"]["data"], - robj["attributes"]["i"]["data"], - robj["attributes"]["p"]["data"], - ), - shape=tuple(robj["attributes"]["Dim"]["data"].tolist()), - ) - - if _cls == "dgRMatrix": - return csr_matrix( - ( - robj["attributes"]["x"]["data"], - robj["attributes"]["i"]["data"], - robj["attributes"]["p"]["data"], - ), - shape=tuple(robj["attributes"]["Dim"]["data"].tolist()), - ) - - if _cls == "dgTMatrix": - return csr_matrix( - ( - robj["attributes"]["x"]["data"], - ( - robj["attributes"]["i"]["data"], - robj["attributes"]["j"]["data"], - ), - ), - shape=tuple(robj["attributes"]["Dim"]["data"].tolist()), - ) - - -def as_dense_matrix(robj, order: Literal["C", "F"] = "F") -> ndarray: - """Parse an R object as a :py:class:`~numpy.ndarray`. - - Args: - robj: - Object parsed from the `RDS` file. - - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - order: - Row-major (**C**-style) or Column-major (**F**ortran-style) - order. - - Defaults to "F". - - Returns: - An ``ndarray`` of the R object. - """ - _cls = get_class(robj) - - if order not in ["C", "F"]: - raise ValueError("order must be either 'C' or 'F'.") - - if _cls not in ["densematrix"]: - raise TypeError(f"obj is not a supported dense matrix format, but is `{_cls}`.") - - return ndarray( - shape=tuple(robj["attributes"]["dim"]["data"].tolist()), - dtype=robj["data"].dtype, - buffer=robj["data"], - order=order, - ) - - -def as_summarized_experiment(robj): - """Parse an R object as a :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment` or - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. - - Note: This function demonstrates how to parse a complex RDS objects in Python and may not work for all - scenarios. - - Args: - robj: - Object parsed from the `RDS` file. - - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - order: - Row-major (**C**-style) or Column-major (**F**ortran-style) - order. Only used if the ``robj`` contains a :py:class:`~numpy.ndarray`. - - Defaults to "F". - - Returns: - A `SummarizedExperiment` or - `SingleCellExperiment` from the R object. - """ - - _cls = get_class(robj) - - if _cls not in ["SingleCellExperiment", "SummarizedExperiment"]: - raise TypeError( - "`robj` does not contain a `SingleCellExperiment` or `SummarizedExperiment`." 
- ) - - # parse assays names - robj_asys = {} - assay_dims = None - asy_names = robj["attributes"]["assays"]["attributes"]["data"]["attributes"][ - "listData" - ]["attributes"]["names"]["data"] - for idx in range(len(asy_names)): - idx_asy = robj["attributes"]["assays"]["attributes"]["data"]["attributes"][ - "listData" - ]["data"][idx] - - asy_class = get_class(idx_asy) - - if asy_class in ["dgCMatrix", "dgRMatrix", "dgTMatrix"]: - robj_asys[asy_names[idx]] = as_sparse_matrix(idx_asy) - if assay_dims is None: - assay_dims = robj_asys[asy_names[idx]].shape - elif asy_class == "densematrix": - robj_asys[asy_names[idx]] = as_dense_matrix(idx_asy) - if assay_dims is None: - assay_dims = robj_asys[asy_names[idx]].shape - else: - robj_asys[asy_names[idx]] = None - - # parse coldata - robj_coldata = as_pandas_from_dframe(robj["attributes"]["colData"]) - if robj_coldata.empty: - robj_coldata = BiocFrame({"_cols": range(assay_dims[1])}) - - # parse rowRanges - robj_rowdata = None - if "rowRanges" in robj["attributes"]: - robj_rowdata = as_pandas_from_dframe( - robj["attributes"]["rowRanges"]["attributes"]["elementMetadata"] - ) - else: - robj_rowdata = BiocFrame({"_rows": range(assay_dims[0])}) - - # check red. dims, alternative expts - robj_reduced_dims = None - robj_altExps = None - if _cls == "SingleCellExperiment": - col_attrs = robj["attributes"]["int_colData"]["attributes"]["listData"][ - "attributes" - ]["names"]["data"] - - for idx in range(len(col_attrs)): - idx_col = col_attrs[idx] - idx_value = robj["attributes"]["int_colData"]["attributes"]["listData"][ - "data" - ][idx] - - if idx_col == "reducedDims" and idx_value["data"] is not None: - robj_reduced_dims = as_dense_matrix( - robj["attributes"]["int_colData"]["attributes"]["listData"]["data"] - ) - - if idx_col == "altExps": - alt_names = idx_value["attributes"]["listData"]["attributes"]["names"][ - "data" - ] - robj_altExps = {} - for idx_alt_names in range(len(alt_names)): - altn = alt_names[idx_alt_names] - - alt_key = list( - idx_value["attributes"]["listData"]["data"][idx_alt_names][ - "attributes" - ].keys() - )[0] - - robj_altExps[altn] = as_summarized_experiment( - idx_value["attributes"]["listData"]["data"][idx_alt_names][ - "attributes" - ][alt_key] - ) - - # ignore colpairs for now, does anyone even use this ? - # if col == "colPairs": - - if _cls == "SummarizedExperiment": - return SummarizedExperiment( - assays=robj_asys, row_data=robj_rowdata, column_data=robj_coldata - ) - elif _cls == "SingleCellExperiment": - return SingleCellExperiment( - assays=robj_asys, - row_data=robj_rowdata, - column_data=robj_coldata, - alternative_experiments=robj_altExps, - reduced_dims=robj_reduced_dims, - ) - else: - raise TypeError( - "`robj` is neither a `SummarizedExperiment` nor `SingleCellExperiment`." - ) diff --git a/src/rds2py/parser.py b/src/rds2py/parser.py deleted file mode 100644 index e8aad9c..0000000 --- a/src/rds2py/parser.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import Dict, MutableMapping - -from .PyRdsReader import PyRdsParser - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def read_rds(file: str) -> Dict: - """Read an RDS file as a :py:class:`~dict`. - - Args: - file (str): Path to RDS file. - - Returns: - MutableMapping: R object as a python dictionary. - """ - parsed_obj = PyRdsParser(file) - realized = parsed_obj.parse() - - return realized - - -def get_class(robj: MutableMapping) -> str: - """Generic method to get the class information of the R object. 
- - Args: - robj (MutableMapping): Object parsed from the `RDS` file. - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - Returns: - str: Class name. - """ - if "class_name" in robj: - return robj["class_name"] - - if "attributes" in robj and len(robj["attributes"].keys()) > 0: - obj_attr = robj["attributes"] - if "class" in obj_attr: - return obj_attr["class"]["data"][0] - - # kind of making this assumption, if we ever see a dim, its a matrix - if "dim" in obj_attr: - return "densematrix" - - return None diff --git a/src/rds2py/pdf.py b/src/rds2py/pdf.py deleted file mode 100644 index a12f015..0000000 --- a/src/rds2py/pdf.py +++ /dev/null @@ -1,72 +0,0 @@ -from .parser import get_class - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def as_pandas_from_data_frame(robj): - """Read an R object to a :py:class:`~pandas.DataFrame`. - - Args: - robj: - Object parsed from the `RDS` file. - - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - Returns: - A `DataFrame` from the R Object. - """ - from pandas import DataFrame - - cls = get_class(robj) - - if cls != "data.frame": - raise TypeError("`robj` does not contain a 'data.frame'.") - - df = DataFrame( - robj["data"], - columns=robj["attributes"]["names"]["data"], - index=robj["attributes"]["row.names"]["data"], - ) - - return df - - -def as_pandas_from_dframe(robj): - """Convert a realized R object to a pandas data frame representation. - - Args: - robj: - Object parsed from the `RDS` file. - - Usually the result of :py:func:`~rds2py.parser.read_rds`. - - Returns: - A `DataFrame` from the R Object. - """ - from pandas import DataFrame - - cls = get_class(robj) - - if cls != "DFrame": - raise Exception("`robj` does not contain a 'DFrame'.") - - data = {} - col_names = robj["attributes"]["listData"]["attributes"]["names"]["data"] - for idx in range(len(col_names)): - idx_asy = robj["attributes"]["listData"]["data"][idx] - - data[col_names[idx]] = idx_asy["data"] - - index = None - if robj["attributes"]["rownames"]["data"]: - index = robj["attributes"]["rownames"]["data"] - - df = DataFrame( - data, - columns=col_names, - index=index, - ) - - return df diff --git a/src/rds2py/rdsutils.py b/src/rds2py/rdsutils.py new file mode 100644 index 0000000..82f52a3 --- /dev/null +++ b/src/rds2py/rdsutils.py @@ -0,0 +1,67 @@ +"""Utility functions for RDS file parsing and class inference. + +This module provides helper functions for parsing RDS files and inferring the appropriate R class information from +parsed objects. +""" + +from .PyRdsReader import PyRdsParser + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def parse_rds(path: str) -> dict: + """Parse an RDS file into a dictionary representation. + + Args: + path: + Path to the RDS file to be parsed. + + Returns: + A dictionary containing the parsed contents of the RDS file. + The structure depends on the type of R object stored in the file. + """ + parsed_obj = PyRdsParser(path) + realized = parsed_obj.parse() + + return realized + + +def get_class(robj: dict) -> str: + """Infer the R class name from a parsed RDS object. + + Notes: + - Handles both S4 and non-S4 R objects + - Special handling for vectors and matrices + - Checks for class information in object attributes + + Args: + robj: + Dictionary containing parsed RDS data, typically + the output of :py:func:`~.parse_rds`. + + Returns: + The inferred R class name, or None if no class can be determined. 
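+
+    Example:
+        A minimal sketch; the path is a placeholder and assumes the file
+        stores an object with a recoverable class attribute:
+
+        >>> robj = parse_rds("example.rds")
+        >>> cls_name = get_class(robj)  # e.g. "data.frame" or "GRanges"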
+ """ + _inferred_cls_name = None + if robj["type"] != "S4": + if "class_name" in robj: + _inferred_cls_name = robj["class_name"] + if _inferred_cls_name is not None and ( + "integer" in _inferred_cls_name or "double" in _inferred_cls_name or _inferred_cls_name == "vector" + ): + if "attributes" in robj: + obj_attr = robj["attributes"] + + # kind of making this assumption, if we ever see a dim, its a matrix + if obj_attr is not None: + if "dim" in obj_attr: + _inferred_cls_name = "ndarray" + elif "class" in obj_attr: + _inferred_cls_name = obj_attr["class"]["data"][0] + + else: + _inferred_cls_name = robj["class_name"] + + return _inferred_cls_name diff --git a/src/rds2py/read_atomic.py b/src/rds2py/read_atomic.py new file mode 100644 index 0000000..bd831c5 --- /dev/null +++ b/src/rds2py/read_atomic.py @@ -0,0 +1,115 @@ +"""Functions for parsing atomic R vector types into Python objects. + +This module provides parser functions for converting R's atomic vector types (boolean, integer, string, and double) into +appropriate Python objects using the biocutils package's specialized list classes. +""" + +from biocutils import BooleanList, FloatList, IntegerList, StringList + +from .generics import _dispatcher + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def _extract_names(robject: dict, **kwargs): + """Extract names attribute from an R object if present. + + Args: + robject: + Dictionary containing parsed R object data. + + **kwargs: + Additional arguments. + + Returns: + List of names if present in the object's attributes, + None otherwise. + """ + _names = None + if "attributes" in robject and robject["attributes"] is not None: + if "names" in robject["attributes"]: + _names = _dispatcher(robject["attributes"]["names"]) + + return _names + + +def read_boolean_vector(robject: dict, **kwargs) -> BooleanList: + """Convert an R boolean vector to a Python :py:class:`~biocutils.BooleanList`. + + Args: + robject: + Dictionary containing parsed R boolean vector data. + + **kwargs: + Additional arguments. + + Returns: + A `BooleanList` object containing the vector data + and any associated names. + """ + _names = _extract_names(robject, **kwargs) + + obj = BooleanList(robject["data"], names=_names) + return obj + + +def read_integer_vector(robject: dict, **kwargs) -> IntegerList: + """Convert an R integer vector to a Python :py:class:`~biocutils.IntegerList`. + + Args: + robject: + Dictionary containing parsed R integer vector data. + + **kwargs: + Additional arguments. + + Returns: + A `IntegerList` object containing the vector data + and any associated names. + """ + _names = _extract_names(robject, **kwargs) + + obj = IntegerList(robject["data"], names=_names) + return obj + + +def read_string_vector(robject: dict, **kwargs) -> StringList: + """Convert an R string vector to a Python :py:class:`~biocutils.StringList`. + + Args: + robject: + Dictionary containing parsed R string vector data. + + **kwargs: + Additional arguments. + + Returns: + A `StringList` object containing the vector data + and any associated names. + """ + _names = _extract_names(robject, **kwargs) + + obj = StringList(robject["data"], names=_names) + return obj + + +def read_double_vector(robject: dict, **kwargs) -> FloatList: + """Convert an R double vector to a Python :py:class:`~biocutils.FloatList`. + + Args: + robject: + Dictionary containing parsed R double vector data. + + **kwargs: + Additional arguments. 
+ + Returns: + A `FloatList` object containing the vector data + and any associated names. + """ + _names = _extract_names(robject, **kwargs) + + obj = FloatList(robject["data"], names=_names) + return obj diff --git a/src/rds2py/read_dict.py b/src/rds2py/read_dict.py new file mode 100644 index 0000000..004ac43 --- /dev/null +++ b/src/rds2py/read_dict.py @@ -0,0 +1,49 @@ +"""Functions for parsing R vector and dictionary-like objects. + +This module provides functionality to convert R named vectors and list objects into Python dictionaries or lists, +maintaining the structure and names of the original R objects. +""" + +from .generics import _dispatcher +from .rdsutils import get_class + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def read_dict(robject: dict, **kwargs) -> dict: + """Convert an R named vector or list to a Python dictionary or list. + + Args: + robject: + Dictionary containing parsed R vector/list data. + + **kwargs: + Additional arguments. + + Returns: + If the R object has names, returns a dictionary mapping + names to values. Otherwise, returns a list of parsed values. + + Example: + >>> # For a named R vector c(a=1, b=2) + >>> result = read_dict(robject) + >>> print(result) + {'a': 1, 'b': 2} + """ + _cls = get_class(robject) + + if _cls not in ["vector"]: + raise RuntimeError(f"`robject` does not contain not a vector/dictionary object, contains `{_cls}`.") + + if "names" not in robject["attributes"]: + return [_dispatcher(x, **kwargs) for x in robject["data"]] + + dict_keys = list(_dispatcher(robject["attributes"]["names"], **kwargs)) + + final_vec = {} + for idx, dkey in enumerate(dict_keys): + final_vec[dkey] = _dispatcher(robject["data"][idx], **kwargs) + + return final_vec diff --git a/src/rds2py/read_factor.py b/src/rds2py/read_factor.py new file mode 100644 index 0000000..1339dba --- /dev/null +++ b/src/rds2py/read_factor.py @@ -0,0 +1,50 @@ +"""Functions for parsing R factor objects. + +This module handles the conversion of R factors (categorical variables) into Python lists, preserving the levels and +maintaining the order of the factor levels. +""" + +from .generics import _dispatcher +from .rdsutils import get_class + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def read_factor(robject: dict, **kwargs) -> list: + """Convert an R factor to a Python list. + + Args: + robject: + Dictionary containing parsed R factor data. + + **kwargs: + Additional arguments. + + Returns: + A list containing the factor values, with each value repeated + according to its length if specified. + """ + _cls = get_class(robject) + + if _cls not in ["factor"]: + raise RuntimeError(f"`robject` does not contain not a factor object, contains `{_cls}`.") + + data = robject["data"] + + levels = None + if "levels" in robject["attributes"]: + levels = _dispatcher(robject["attributes"]["levels"], **kwargs) + level_vec = [levels[x - 1] for x in data] + + if "lengths" in robject["attributes"]: + lengths = _dispatcher(robject["attributes"]["lengths"], **kwargs) + else: + lengths = [1] * len(data) + + final_vec = [] + for i, x in enumerate(lengths): + final_vec.extend([level_vec[i]] * x) + + return final_vec diff --git a/src/rds2py/read_frame.py b/src/rds2py/read_frame.py new file mode 100644 index 0000000..bc61922 --- /dev/null +++ b/src/rds2py/read_frame.py @@ -0,0 +1,92 @@ +"""Functions for parsing R data frame objects. 
+ +This module provides parsers for converting both base R `data.frame` objects +and Bioconductor `DataFrame` objects into Python `BiocFrame` objects, preserving +row names, column names, and data types. +""" + +from biocframe import BiocFrame + +from .generics import _dispatcher +from .rdsutils import get_class + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def read_data_frame(robject: dict, **kwargs): + """Convert an R data.frame to a :py:class:`~biocframe.BiocFrame` object. + + Args: + robject: + Dictionary containing parsed R `data.frame` object. + + **kwargs: + Additional arguments. + + Returns: + A BiocFrame object containing the data frame's contents, + with preserved column and row names. + """ + cls = get_class(robject) + + if cls != "data.frame": + raise RuntimeError("`robject` does not contain a 'data.frame'.") + + col_names = _dispatcher(robject["attributes"]["names"], **kwargs) + + bframe_obj = {} + for idx, rd in enumerate(robject["data"]): + bframe_obj[col_names[idx]] = _dispatcher(rd, **kwargs) + + df = BiocFrame( + bframe_obj, + row_names=_dispatcher(robject["attributes"]["row.names"], **kwargs), + ) + + return df + + +def read_dframe(robject: dict, **kwargs): + """Convert an R DFrame (Bioconductor's `DataFrame`) to a `BiocFrame` object. + + Args: + robject: + Dictionary containing parsed R `DFrame` object. + + **kwargs: + Additional arguments. + + Returns: + A BiocFrame object containing the DataFrame's contents, + with preserved metadata and structure. + """ + from biocframe import BiocFrame + + cls = get_class(robject) + + if cls != "DFrame": + raise RuntimeError("`robject` does not contain a 'DFrame'.") + + data = {} + col_names = _dispatcher(robject["attributes"]["listData"]["attributes"]["names"], **kwargs) + for idx, colname in enumerate(col_names): + data[colname] = _dispatcher(robject["attributes"]["listData"]["data"][idx], **kwargs) + + index = None + if robject["attributes"]["rownames"]["data"]: + index = _dispatcher(robject["attributes"]["rownames"], **kwargs) + + nrows = None + if robject["attributes"]["nrows"]["data"]: + nrows = list(_dispatcher(robject["attributes"]["nrows"]), **kwargs)[0] + + df = BiocFrame( + data, + # column_names=col_names, + row_names=index, + number_of_rows=nrows, + ) + + return df diff --git a/src/rds2py/read_granges.py b/src/rds2py/read_granges.py new file mode 100644 index 0000000..5a39520 --- /dev/null +++ b/src/rds2py/read_granges.py @@ -0,0 +1,113 @@ +"""Functions for parsing Bioconductor GenomicRanges objects. + +This module provides parsers for converting Bioconductor's GenomicRanges and GenomicRangesList objects into their Python +equivalents, preserving all genomic coordinates and associated metadata. +""" + +from genomicranges import GenomicRanges, GenomicRangesList, SeqInfo +from iranges import IRanges + +from .generics import _dispatcher +from .rdsutils import get_class + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def read_genomic_ranges(robject: dict, **kwargs) -> GenomicRanges: + """Convert an R `GenomicRanges` object to a Python :py:class:`~genomicranges.GenomicRanges` object. + + Args: + robject: + Dictionary containing parsed `GenomicRanges` data. + + **kwargs: + Additional arguments. + + Returns: + A Python `GenomicRanges` object containing genomic intervals + with associated annotations. 
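+
+    Example:
+        A minimal sketch; the path is a placeholder and assumes the file
+        stores a `GRanges` object:
+
+        >>> from rds2py import parse_rds
+        >>> gr = read_genomic_ranges(parse_rds("granges.rds"))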
+ """ + _cls = get_class(robject) + + if _cls not in ["GenomicRanges", "GRanges"]: + raise TypeError(f"obj is not 'GenomicRanges', but is `{_cls}`.") + + _range_start = _dispatcher(robject["attributes"]["ranges"]["attributes"]["start"], **kwargs) + _range_width = _dispatcher(robject["attributes"]["ranges"]["attributes"]["width"], **kwargs) + _range_names = None + if "NAMES" in robject["attributes"]["ranges"]["attributes"]: + _tmp_names = robject["attributes"]["ranges"]["attributes"]["NAMES"] + _range_names = _dispatcher(_tmp_names, **kwargs) + if _range_names is not None: + _range_names = list(_range_names) + + _ranges = IRanges(_range_start, _range_width, names=_range_names) + + _strands = _dispatcher(robject["attributes"]["strand"], **kwargs) + _seqnames = _dispatcher(robject["attributes"]["seqnames"], **kwargs) + _seqinfo_seqnames = _dispatcher(robject["attributes"]["seqinfo"]["attributes"]["seqnames"], **kwargs) + _seqinfo_seqlengths = _dispatcher(robject["attributes"]["seqinfo"]["attributes"]["seqlengths"], **kwargs) + _seqinfo_is_circular = _dispatcher(robject["attributes"]["seqinfo"]["attributes"]["is_circular"], **kwargs) + _seqinfo_genome = _dispatcher(robject["attributes"]["seqinfo"]["attributes"]["genome"], **kwargs) + _seqinfo = SeqInfo( + seqnames=_seqinfo_seqnames, + seqlengths=_seqinfo_seqlengths, + is_circular=_seqinfo_is_circular, + genome=_seqinfo_genome, + ) + _mcols = _dispatcher(robject["attributes"]["elementMetadata"], **kwargs) + + _gr_names = None + if "NAMES" in robject["attributes"]: + _tmp_names = robject["attributes"]["NAMES"] + _gr_names = None if _tmp_names is None else _dispatcher(_tmp_names, **kwargs) + + return GenomicRanges( + seqnames=_seqnames, + ranges=_ranges, + strand=_strands, + names=_gr_names, + mcols=_mcols, + seqinfo=_seqinfo, + ) + + +def read_granges_list(robject: dict, **kwargs) -> GenomicRangesList: + """Convert an R `GenomicRangesList` object to a Python :py:class:`~genomicranges.GenomicRangesList`. + + Args: + robject: + Dictionary containing parsed GenomicRangesList data. + + **kwargs: + Additional arguments. + + Returns: + A Python `GenomicRangesList` object containing containing multiple + `GenomicRanges` objects. + """ + + _cls = get_class(robject) + + if _cls not in ["CompressedGRangesList", "GRangesList"]: + raise TypeError(f"`robject` is not genomic ranges list, but is `{_cls}`.") + + _gre = _dispatcher(robject["attributes"]["unlistData"], **kwargs) + + _groups = None + if "NAMES" in robject["attributes"]["partitioning"]["attributes"]: + _tmp_names = robject["attributes"]["partitioning"]["attributes"]["NAMES"] + _groups = None if _tmp_names is None else _dispatcher(_tmp_names, **kwargs) + + _partitionends = _dispatcher(robject["attributes"]["partitioning"]["attributes"]["end"], **kwargs) + + _grelist = [] + + current = 0 + for _pend in _partitionends: + _grelist.append(_gre[current:_pend]) + current = _pend + + return GenomicRangesList(ranges=_grelist, names=_groups) diff --git a/src/rds2py/read_mae.py b/src/rds2py/read_mae.py new file mode 100644 index 0000000..98d0650 --- /dev/null +++ b/src/rds2py/read_mae.py @@ -0,0 +1,80 @@ +"""Functions for parsing Bioconductor MultiAssayExperiment objects. + +This module handles the conversion of Bioconductor's MultiAssayExperiment container format into its Python equivalent, +preserving the complex relationships between multiple experimental assays and sample metadata. 
+""" + +from multiassayexperiment import MultiAssayExperiment + +from .generics import _dispatcher +from .rdsutils import get_class +from .read_matrix import MatrixWrapper + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def _sanitize_expts(expts, **kwargs): + """Convert raw experiment objects into SummarizedExperiment format. + + Args: + expts: + Dictionary of experiment objects. + + Returns: + Dictionary of converted experiments, with matrix-like objects + wrapped in SummarizedExperiment containers. + """ + from biocframe import BiocFrame + from summarizedexperiment import SummarizedExperiment + + res = {} + for k, v in expts.items(): + if isinstance(v, MatrixWrapper): + res[k] = SummarizedExperiment( + assays={"matrix": v.matrix}, + row_data=BiocFrame(row_names=v.dimnames[0]), + column_data=BiocFrame(row_names=v.dimnames[1]), + ) + else: + res[k] = v + + return res + + +def read_multi_assay_experiment(robject: dict, **kwargs) -> MultiAssayExperiment: + """Convert an R `MultiAssayExperiment` to a Python :py:class:`~multiassayexperiment.MultiAssayExperiment` object. + + Args: + robject: + Dictionary containing parsed MultiAssayExperiment data. + + **kwargs: + Additional arguments. + + Returns: + A Python `MultiAssayExperiment` object containing + multiple experimental assays with associated metadata. + """ + + _cls = get_class(robject) + + if _cls not in ["MultiAssayExperiment"]: + raise RuntimeError(f"`robject` does not contain a 'MultiAssayExperiment' object, contains `{_cls}`.") + + # parse experiment names + _expt_obj = robject["attributes"]["ExperimentList"]["attributes"]["listData"] + robj_expts = _dispatcher(_expt_obj, **kwargs) + + # parse sample_map + robj_samplemap = _dispatcher(robject["attributes"]["sampleMap"], **kwargs) + + # parse coldata + robj_coldata = _dispatcher(robject["attributes"]["colData"], **kwargs) + + return MultiAssayExperiment( + experiments=_sanitize_expts(robj_expts), + sample_map=robj_samplemap, + column_data=robj_coldata, + ) diff --git a/src/rds2py/read_matrix.py b/src/rds2py/read_matrix.py new file mode 100644 index 0000000..79d6e5b --- /dev/null +++ b/src/rds2py/read_matrix.py @@ -0,0 +1,209 @@ +"""Functions and classes for parsing R matrix objects. + +This module provides functionality to convert R matrix objects (both dense and sparse) into their Python equivalents +using NumPy and SciPy sparse matrix formats. It handles various R matrix types including dgCMatrix, dgRMatrix, and +dgTMatrix. +""" + +from typing import Literal + +from numpy import ndarray +from scipy.sparse import csc_matrix, csr_matrix, spmatrix + +from .generics import _dispatcher +from .rdsutils import get_class + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +class MatrixWrapper: + """A simple wrapper class for matrices that preserves dimension names. + + This class bundles a matrix (dense or sparse) with its dimension names, + maintaining the R-style naming of rows and columns. + + Attributes: + matrix: + The underlying matrix object (numpy.ndarray or scipy.sparse matrix). + + dimnames: + A tuple of (row_names, column_names), each being a list of strings or None. + """ + + def __init__(self, matrix, dimnames=None) -> None: + self.matrix = matrix + self.dimnames = dimnames + + +def _as_sparse_matrix(robject: dict, **kwargs) -> spmatrix: + """Convert an R sparse matrix to a SciPy sparse matrix. 
+ + Notes: + - Supports dgCMatrix (column-sparse) + - Supports dgRMatrix (row-sparse) + - Supports dgTMatrix (triplet format) + - Preserves dimension names if present + + Args: + robject: + Dictionary containing parsed R sparse matrix data. + + **kwargs: + Additional arguments. + + Returns: + A SciPy sparse matrix or wrapped matrix if dimension names exist. + """ + + _cls = get_class(robject) + + if _cls not in ["dgCMatrix", "dgRMatrix", "dgTMatrix"]: + raise RuntimeError(f"`robject` does not contain not a supported sparse matrix format, contains `{_cls}`.") + + if _cls == "dgCMatrix": + mat = csc_matrix( + ( + robject["attributes"]["x"]["data"], + robject["attributes"]["i"]["data"], + robject["attributes"]["p"]["data"], + ), + shape=tuple(robject["attributes"]["Dim"]["data"].tolist()), + ) + elif _cls == "dgRMatrix": + mat = csr_matrix( + ( + robject["attributes"]["x"]["data"], + robject["attributes"]["i"]["data"], + robject["attributes"]["p"]["data"], + ), + shape=tuple(robject["attributes"]["Dim"]["data"].tolist()), + ) + elif _cls == "dgTMatrix": + mat = csr_matrix( + ( + robject["attributes"]["x"]["data"], + ( + robject["attributes"]["i"]["data"], + robject["attributes"]["j"]["data"], + ), + ), + shape=tuple(robject["attributes"]["Dim"]["data"].tolist()), + ) + + names = None + if "dimnames" in robject["attributes"]: + names = _dispatcher(robject["attributes"]["dimnames"], **kwargs) + if names is not None and len(names) > 0: + return MatrixWrapper(mat, names) + + return mat + + +def _as_dense_matrix(robject, order: Literal["C", "F"] = "F", **kwargs) -> ndarray: + """Convert an R matrix to a `NumPy` array. + + Args: + robject: + Dictionary containing parsed R matrix data. + + order: + Memory layout for the array. + 'C' for row-major, 'F' for column-major (default). + + **kwargs: + Additional arguments. + + Returns: + A NumPy array or wrapped array if dimension names exist. + """ + _cls = get_class(robject) + + if order not in ["C", "F"]: + raise ValueError("order must be either 'C' or 'F'.") + + if _cls not in ["ndarray"]: + raise TypeError(f"obj is not a supported dense matrix format, but is `{_cls}`.") + + mat = ndarray( + shape=tuple(robject["attributes"]["dim"]["data"].tolist()), + dtype=robject["data"].dtype, + buffer=robject["data"], + order=order, + ) + + names = None + if "dimnames" in robject["attributes"]: + names = _dispatcher(robject["attributes"]["dimnames"], **kwargs) + if names is not None and len(names) > 0: + return MatrixWrapper(mat, names) + + return mat + + +def read_dgcmatrix(robject: dict, **kwargs) -> spmatrix: + """Parse an R dgCMatrix (sparse column matrix). + + Args: + robject: + Dictionary containing parsed dgCMatrix data. + + **kwargs: + Additional arguments. + + Returns: + Parsed sparse column matrix. + """ + return _as_sparse_matrix(robject, **kwargs) + + +def read_dgrmatrix(robject: dict, **kwargs) -> spmatrix: + """Parse an R dgRMatrix (sparse row matrix). + + Args: + robject: + Dictionary containing parsed dgRMatrix data. + + **kwargs: + Additional arguments. + + Returns: + Parsed sparse row matrix. + """ + return _as_sparse_matrix(robject, **kwargs) + + +def read_dgtmatrix(robject: dict, **kwargs) -> spmatrix: + """Parse an R dgTMatrix (sparse triplet matrix).. + + Args: + robject: + Dictionary containing parsed dgTMatrix data. + + **kwargs: + Additional arguments. + + Returns: + Parsed sparse matrix. 
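+
+    Example:
+        >>> # illustrative; uses the s4_matrix_dgt.rds file from the test data
+        >>> from rds2py import read_rds
+        >>> mat = read_rds("tests/data/s4_matrix_dgt.rds")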
+ """ + return _as_sparse_matrix(robject, **kwargs) + + +def read_ndarray(robject: dict, order: Literal["C", "F"] = "F", **kwargs) -> ndarray: + """Parse an R matrix as a NumPy array. + + Args: + robject: + Dictionary containing parsed dgCMatrix data. + + order: + Memory layout for the array. + + **kwargs: + Additional arguments. + + Returns: + Parsed dense array. + """ + return _as_dense_matrix(robject, order=order, **kwargs) diff --git a/src/rds2py/read_rle.py b/src/rds2py/read_rle.py new file mode 100644 index 0000000..9a33716 --- /dev/null +++ b/src/rds2py/read_rle.py @@ -0,0 +1,50 @@ +"""Functions for parsing R's Rle (Run-length encoding) objects. + +This module provides functionality to convert R's Rle (Run-length encoding) objects into Python lists, expanding the +compressed representation into its full form. +""" + +from .generics import _dispatcher +from .rdsutils import get_class + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def read_rle(robject: dict, **kwargs) -> list: + """Convert an R Rle object to a Python list. + + Args: + robject: + Dictionary containing parsed Rle data. + + **kwargs: + Additional arguments. + + Returns: + Expanded list where each value is repeated according to its run length. + + Example: + >>> # For Rle with values=[1,2] and lengths=[3,2] + >>> result = read_rle(robject) + >>> print(result) + [1, 1, 1, 2, 2] + """ + _cls = get_class(robject) + + if _cls != "Rle": + raise RuntimeError(f"`robject` does not contain a 'Rle' object, contains `{_cls}`.") + + data = list(_dispatcher(robject["attributes"]["values"], **kwargs)) + + if "lengths" in robject["attributes"]: + lengths = _dispatcher(robject["attributes"]["lengths"], **kwargs) + else: + lengths = [1] * len(data) + + final_vec = [] + for i, x in enumerate(lengths): + final_vec.extend([data[i]] * x) + + return final_vec diff --git a/src/rds2py/read_sce.py b/src/rds2py/read_sce.py new file mode 100644 index 0000000..763cbe4 --- /dev/null +++ b/src/rds2py/read_sce.py @@ -0,0 +1,86 @@ +"""Functions for parsing Bioconductor `SingleCellExperiment` objects. + +This module provides parsers for converting Bioconductor's `SingleCellExperiment` +objects into their Python equivalents, handling the complex structure of single-cell +data including multiple assays, reduced dimensions, and alternative experiments. +""" + +from singlecellexperiment import SingleCellExperiment + +from .generics import _dispatcher +from .rdsutils import get_class + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def read_alts_summarized_experiment_by_column(robject: dict, **kwargs): + """Parse alternative experiments in a SingleCellExperiment.""" + _cls = get_class(robject) + + if _cls not in ["SummarizedExperimentByColumn"]: + raise RuntimeError(f"`robject` does not contain a 'SummarizedExperimentByColumn' object, contains `{_cls}`.") + + objs = {} + + for key, val in robject["attributes"].items(): + objs[key] = _dispatcher(val, **kwargs) + + return objs + + +def read_single_cell_experiment(robject: dict, **kwargs) -> SingleCellExperiment: + """Convert an R SingleCellExperiment to Python SingleCellExperiment. + + Args: + robject: + Dictionary containing parsed SingleCellExperiment data. + + **kwargs: + Additional arguments. + + Returns: + A Python SingleCellExperiment object containing + the assay data and associated metadata. 
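+
+    Example:
+        >>> # illustrative; assumes an RDS file holding a serialized SingleCellExperiment
+        >>> from rds2py import read_rds
+        >>> sce = read_rds("tests/data/simple_sce.rds")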
+    """
+
+    _cls = get_class(robject)
+
+    if _cls not in ["SingleCellExperiment"]:
+        raise RuntimeError(f"`robject` does not contain a 'SingleCellExperiment' object, contains `{_cls}`.")
+
+    robject["class_name"] = "RangedSummarizedExperiment"
+    _rse = _dispatcher(robject, **kwargs)
+
+    # check reduced dimensions and alternative experiments
+    robj_reduced_dims = None
+    robj_alt_exps = None
+    col_attrs = list(
+        _dispatcher(robject["attributes"]["int_colData"]["attributes"]["listData"]["attributes"]["names"], **kwargs)
+    )
+
+    for idx in range(len(col_attrs)):
+        idx_col = col_attrs[idx]
+        idx_value = robject["attributes"]["int_colData"]["attributes"]["listData"]["data"][idx]
+
+        if idx_col == "reducedDims" and idx_value.get("data", None) is not None:
+            robj_reduced_dims = _dispatcher(idx_value, **kwargs)
+
+        if idx_col == "altExps":
+            alt_names = list(_dispatcher(idx_value["attributes"]["listData"]["attributes"]["names"], **kwargs))
+            robj_alt_exps = {}
+            for alt_idx, altn in enumerate(alt_names):
+                robj_alt_exps[altn] = _dispatcher(idx_value["attributes"]["listData"]["data"][alt_idx], **kwargs)["se"]
+
+    # column pairs ("colPairs") are currently ignored.
+
+    return SingleCellExperiment(
+        assays=_rse.assays,
+        row_data=_rse.row_data,
+        column_data=_rse.column_data,
+        row_ranges=_rse.row_ranges,
+        alternative_experiments=robj_alt_exps,
+        reduced_dims=robj_reduced_dims,
+    )
diff --git a/src/rds2py/read_se.py b/src/rds2py/read_se.py
new file mode 100644
index 0000000..8da0a09
--- /dev/null
+++ b/src/rds2py/read_se.py
@@ -0,0 +1,110 @@
+from summarizedexperiment import RangedSummarizedExperiment, SummarizedExperiment
+
+from .generics import _dispatcher
+from .rdsutils import get_class
+from .read_matrix import MatrixWrapper
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+def _sanitize_empty_frame(frame, nrows):
+    # Replace an entirely empty frame with a placeholder of the expected length.
+    if frame.shape == (0, 0):
+        from biocframe import BiocFrame
+
+        return BiocFrame(number_of_rows=nrows)
+
+    return frame
+
+
+def _sanitize_assays(assays):
+    res = {}
+    for k, v in assays.items():
+        if isinstance(v, MatrixWrapper):
+            res[k] = v.matrix
+        else:
+            res[k] = v
+
+    return res
+
+
+def read_summarized_experiment(robject: dict, **kwargs) -> SummarizedExperiment:
+    """Convert an R SummarizedExperiment to Python
+    :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.
+
+    Args:
+        robject:
+            Dictionary containing parsed SummarizedExperiment data.
+
+        **kwargs:
+            Additional arguments.
+
+    Returns:
+        A `SummarizedExperiment` from the R object.
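+
+    Example:
+        >>> # illustrative; assumes an RDS file holding a serialized SummarizedExperiment
+        >>> from rds2py import read_rds
+        >>> se = read_rds("tests/data/sumexpt.rds")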
+ """ + + _cls = get_class(robject) + + if _cls not in ["SummarizedExperiment"]: + raise RuntimeError(f"`robject` does not contain a 'SummarizedExperiment' object, contains `{_cls}`.") + # parse assays names + robj_asys = {} + assay_dims = None + asy_names = list( + _dispatcher( + robject["attributes"]["assays"]["attributes"]["data"]["attributes"]["listData"]["attributes"]["names"], + **kwargs, + ) + ) + for idx, asyname in enumerate(asy_names): + idx_asy = robject["attributes"]["assays"]["attributes"]["data"]["attributes"]["listData"]["data"][idx] + + robj_asys[asyname] = _dispatcher(idx_asy, **kwargs) + if assay_dims is None: + assay_dims = robj_asys[asyname].shape + + # parse coldata + robj_coldata = _sanitize_empty_frame(_dispatcher(robject["attributes"]["colData"], **kwargs), assay_dims[1]) + + # parse rowdata + robj_rowdata = _sanitize_empty_frame(_dispatcher(robject["attributes"]["elementMetadata"], **kwargs), assay_dims[0]) + + return SummarizedExperiment( + assays=_sanitize_assays(robj_asys), + row_data=robj_rowdata, + column_data=robj_coldata, + ) + + +def read_ranged_summarized_experiment(robject: dict, **kwargs) -> RangedSummarizedExperiment: + """Convert an R RangedSummarizedExperiment to its Python equivalent. + + Args: + robject: + Dictionary containing parsed SummarizedExperiment data. + + **kwargs: + Additional arguments. + + Returns: + A Python RangedSummarizedExperiment object. + """ + + _cls = get_class(robject) + + if _cls not in ["RangedSummarizedExperiment"]: + raise RuntimeError(f"`robject` does not contain a 'RangedSummarizedExperiment' object, contains `{_cls}`.") + + robject["class_name"] = "SummarizedExperiment" + _se = _dispatcher(robject, **kwargs) + + # parse rowRanges + row_ranges_data = None + if "rowRanges" in robject["attributes"]: + row_ranges_data = _dispatcher(robject["attributes"]["rowRanges"], **kwargs) + + return RangedSummarizedExperiment( + assays=_se.assays, + row_data=_se.row_data, + column_data=_se.column_data, + row_ranges=row_ranges_data, + ) diff --git a/tests/data/.Rhistory b/tests/data/.Rhistory deleted file mode 100644 index 4e4f090..0000000 --- a/tests/data/.Rhistory +++ /dev/null @@ -1,512 +0,0 @@ -"', def: '", -defs[[i]] , -"'}) ", -"with o ", -"MATCH (p:OntoTerm { id: '", -multi_parents[j], -"'}) ", -"CREATE (p)<-[:parent]-(o) ", -"CREATE (p)-[:child]->(o) " -) -q_cypher <- gsub("\"", "", query_parents) -call_neo4j(q_cypher, con) -# quotes <- c(quotes, query_parents) -# call_neo4j(query_parents, con) -} -} -} -} -importOntology("~/Projects/work/scSearch/scripts/ontologies/v1/efo_v3.32.0.obo", "3.32", "EFO") -importOntology <- -function(onto_path, -version, -source, -neo4j_user = "neo4j", -neo4j_pass = "test") { -con <- neo4j_api$new(url = "http://localhost:7474", -user = neo4j_user, -password = neo4j_pass) -query_node <- -paste0( -"MERGE (o:OntoSource { source:'", source , "', version: '", -version, -"'})" -) -q_cypher <- gsub("\"", "", query_node) -# quotes <- c(quotes, query_node) -call_neo4j(q_cypher, con) -out <- -ontologyIndex::get_ontology(onto_path, extract_tags = "everything") -ids <- out$id -names <- out$name -parents <- out$parents -ancestors <- out$ancestors -is_as <- out$is_a -defs <- out$def -nss <- out$namespace -quotes <- c() -for (i in 1:length(ids)) { -query_node <- -paste0( -"MERGE (os:OntoSource { source:'", source , "', version: '", -version, -"'}) ", -"MERGE (o:OntoTerm { source:'", source , "', id: '", -ids[[i]] , -"', name: '", -names[[i]], -"', def: '", -defs[[i]] , -"'})", -"with os, o ", -"CREATE 
(o)-[:source]->(os) " -) -q_cypher <- gsub("\"", "", query_node) -# quotes <- c(quotes, query_node) -call_neo4j(q_cypher, con) -if (length(unname(unlist(nss[i]))) > 0) { -query_namespace <- -paste0( -"MERGE (o:OntoTerm { id: '", -ids[[i]] , -"', name: '", -names[[i]], -"', def: '", -defs[[i]] , -"'}) ", -"MERGE (n:OntoNamespace { id: '", -unlist(unname(nss[i])), -"'}) ", -"with o, n ", -"CREATE (o)-[:namespace]->(n) " -) -q_cypher <- gsub("\"", "", query_namespace) -call_neo4j(q_cypher, con) -# quotes <- c(quotes, query_namespace) -# call_neo4j(query_namespace, con) -} -} -for (i in 1:length(ids)) { -if (length(unname(unlist(parents[i]))) > 0) { -multi_parents <- strsplit(unname(unlist(parents[i])), ";") -for (j in 1:length(multi_parents)) { -query_parents = paste0( -"MERGE (o:OntoTerm { id: '", -ids[[i]] , -"', name: '", -names[[i]], -"', def: '", -defs[[i]] , -"'}) ", -"with o ", -"MATCH (p:OntoTerm { id: '", -multi_parents[j], -"'}) ", -"CREATE (p)<-[:parent]-(o) ", -"CREATE (p)-[:child]->(o) " -) -q_cypher <- gsub("\"", "", query_parents) -call_neo4j(q_cypher, con) -# quotes <- c(quotes, query_parents) -# call_neo4j(query_parents, con) -} -} -} -} -importOntology("~/Projects/work/scSearch/scripts/ontologies/v1/efo_v3.32.0.obo", "3.32", "EFO") -out <- -ontologyIndex::get_ontology("~/Projects/work/scSearch/scripts/ontologies/v1/uberon_7_27_2021.obo", extract_tags = "everything") -ids <- out$id -names <- out$name -parents <- out$parents -ancestors <- out$ancestors -is_as <- out$is_a -defs <- out$def -nss <- out$namespace -i < 1 -i <- 1 -query_node <- -paste0( -"MATCH (os:OntoSource { source:'", source , "', version: '", -version, -"'}) ", -"MERGE (o:OntoTerm { source:'", source , "', id: '", -ids[[i]] , -"', name: '", -names[[i]], -"', def: '", -defs[[i]] , -"'})", -"with os, o ", -"CREATE (o)-[:source]->(os) " -) -q_cypher <- gsub("\"", "", query_node) -source <- "UBERON" -version <- "TEST" -query_node <- -paste0( -"MATCH (os:OntoSource { source:'", source , "', version: '", -version, -"'}) ", -"MERGE (o:OntoTerm { source:'", source , "', id: '", -ids[[i]] , -"', name: '", -names[[i]], -"', def: '", -defs[[i]] , -"'})", -"with os, o ", -"CREATE (o)-[:source]->(os) " -) -q_cypher <- gsub("\"", "", query_node) -print(q_cypher) -library(neo4r) -library(ontologyIndex) -importOntology <- -function(onto_path, -version, -source, -neo4j_user = "neo4j", -neo4j_pass = "test") { -con <- neo4j_api$new(url = "http://localhost:7474", -user = neo4j_user, -password = neo4j_pass) -query_node <- -paste0( -"MERGE (o:OntoSource { source:'", source , "', version: '", -version, -"'})" -) -q_cypher <- gsub("\"", "", query_node) -# print(q_cypher) -call_neo4j(q_cypher, con) -out <- -ontologyIndex::get_ontology(onto_path, extract_tags = "everything") -ids <- out$id -names <- out$name -parents <- out$parents -ancestors <- out$ancestors -is_as <- out$is_a -defs <- out$def -nss <- out$namespace -for (i in 1:length(ids)) { -query_node <- -paste0( -"MATCH (os:OntoSource { source:'", source , "', version: '", -version, -"'}) ", -"MERGE (o:OntoTerm { source:'", source , "', id: '", -ids[[i]] , -"', name: '", -names[[i]], -"', def: '", -defs[[i]] , -"'})", -"with os, o ", -"CREATE (o)-[:source]->(os) " -) -q_cypher <- gsub("\"", "", query_node) -# print(q_cypher) -call_neo4j(q_cypher, con) -if (length(unname(unlist(nss[i]))) > 0) { -query_namespace <- -paste0( -"MERGE (o:OntoTerm { id: '", -ids[[i]] , -"', name: '", -names[[i]], -"', def: '", -defs[[i]] , -"'}) ", -"MERGE (n:OntoNamespace { id: '", 
-unlist(unname(nss[i])), -"'}) ", -"with o, n ", -"CREATE (o)-[:namespace]->(n) " -) -q_cypher <- gsub("\"", "", query_namespace) -# print(q_cypher) -call_neo4j(q_cypher, con) -} -} -for (i in 1:length(ids)) { -if (length(unname(unlist(parents[i]))) > 0) { -multi_parents <- strsplit(unname(unlist(parents[i])), ";") -for (j in 1:length(multi_parents)) { -query_parents = paste0( -"MERGE (o:OntoTerm { id: '", -ids[[i]] , -"', name: '", -names[[i]], -"', def: '", -defs[[i]] , -"'}) ", -"with o ", -"MATCH (p:OntoTerm { id: '", -multi_parents[j], -"'}) ", -"CREATE (p)<-[:parent]-(o) ", -"CREATE (p)-[:child]->(o) " -) -q_cypher <- gsub("\"", "", query_parents) -# print(q_cypher) -call_neo4j(q_cypher, con) -} -} -} -} -# UBERON -importOntology("~/Projects/work/scSearch/scripts/ontologies/v1/uberon_7_27_2021.obo", "7_27_2021", "UBERON") -importOntology("~/Projects/work/scSearch/scripts/ontologies/v1/uberon_7_27_2021.obo", "7_27_2021", "UBERON") -m <- sparseMatrix( -i = sample(x = 1e4, size = 1e4), -j = sample(x = 1e4, size = 1e4), -x = rnorm(n = 1e4) -) -mm -??sample -library(Matrix) -m <- sparseMatrix( -i = sample(x = 1e4, size = 1e4), -j = sample(x = 1e4, size = 1e4), -x = rnorm(n = 1e4) -) -m -??sparseMatrix -?sparseMatrix -install.packages("viewpoly") -viewpoly::run_app() -viewpoly::run_app() -viewpoly::run_app() -viewpoly::run_app() -suppressPackageStartupMessages(library(scater)) -suppressPackageStartupMessages(library(zellkonverter)) -set.seed(1000) -sce <- mockSCE() -dim(sce) -#> [1] 2000 200 -# Will use `Treatment` as a fake batch variable. -table(sce$Treatment) -writeH5AD(sce, file = "mockSCE.h5ad") -BiocManager::install("HCAData") -library("HCAData") -HCAData() -HCAData() -library("HCAData") -HCAData() -HCAData("ica_bone_marrow") -library(dsassembly) -restUrl("https://dev.cerberus.genomics.roche.com/v2") -ds <- getDataset("DS000020088") -dsassembly::activeUserCache(()) -dsassembly::activeUserCache() -dsassembly::userCache() -library(dsassembly) -library(dsassembly) -restUrl("https://dev.cerberus.genomics.roche.com/v2") -ds <- getDataset("DS000020088") -library(dsassembly) -library(zellkonverter) -library(jsonvalidate) -sce <- zellkonverter::readH5AD("~/Projects/GSM3138367.h5ad") -sce <- annotateExperiment(sce, -title="test_sce", -description="tst_sce_description", -annotation=NULL, -sources=list( -list(name="GEO", id="asbc7") -), -organism="Mus musculus", -namespace=list( -list(type="genome", id="GRCm38") -), -technology=list(name="scRNA-seq",details="10X Genomics") -) -row.names(sce) <- make.unique(row.names(sce)) -library(MultiAssayExperiment) -mae <- MultiAssayExperiment(experiments=list("experiment-1" = sce)) -mae <- annotateDataset(mae, -title="test", -description="test_desc", -authors=c("kancherj")) -library(dsdb.plus) -dsassembly::saveDataset(mae, dir="./Projects/work/test-datasets-upload/", stage.only=TRUE) -ds <- getDataset("DS000020088") -traceback() -library(dsassembly) -restUrl("https://dev.cerberus.genomics.roche.com/v2") -ds <- getDataset("DS000020102") -ds <- getDataset("DS000000267") -ds -rowRanges((ds)) -rowRanges(ds) -rowRanges(rowdata(ds)() -rowRanges(rowdata(ds)) -rowRanges(rowdata(ds)) -rowRanges(rowData(ds)) -experiments(ds) -experiments(ds)[["RNA-Seq_hsa_gene"]] -rowRanges(experiments(ds)[["RNA-Seq_hsa_gene"]]) -as(rowRanges(experiments(ds)[["RNA-Seq_hsa_gene"]]), "data.frame") -View(as(rowRanges(experiments(ds)[["RNA-Seq_hsa_gene"]]), "data.frame")) -View(as(rowRanges(experiments(getDataset("DS000000267"))[["RNA-Seq_hsa_gene"]]), "data.frame")) 
-library(dsassembly) -restUrl("https://dev.cerberus.genomics.roche.com/v2") -ds <- getDataset("DS000020088") -ds <- getDataset("DS000000264") -library(dsassembly) -restUrl("https://dev.cerberus.genomics.roche.com/v2") -ds <- getDataset("DS000020104") -library(genomitory) -hits <- searchFiles(type="collection", n=Inf) -# Ignore the FeatureDB legacy feature sets. -hits <- hits[hits$project != "GMTY28",] -descriptions <- list() -collections <- list() -all.genes <- all.ids <- integer(0) -counter <- 0L -for (h in seq_len(nrow(hits))) { -cursets <- getFeatureSetCollection(hits$id[h]) -descriptions[[h]] <- mcols(cursets) -descriptions[[h]]$size <- lengths(cursets) -collections[[h]] <- data.frame(id = hits$id[h], number = length(cursets), title=hits$title[h], description=hits$description[h], species=hits$organism[h]) -all.genes <- c(all.genes, unlist(cursets, use.names=FALSE)) -all.ids <- c(all.ids, rep(seq_along(cursets) + counter, lengths(cursets))) -counter <- counter + length(cursets) -} -descriptions <- do.call(rbind, descriptions) -collections <- do.call(rbind, collections) -u.genes <- unique(all.genes) -all.genes <- match(all.genes, u.genes) -by.gene <- split(all.ids, factor(all.genes, seq_along(u.genes))) -by.set <- split(all.genes, factor(all.ids, seq_len(nrow(descriptions)))) -gathered <- list( -list( -id = "GMTY17:GRCm38/GRCm38.IGIS4.0.genes.rds@REVISION-3", -field = "symbol" -), -list( -id = "GMTY17:GRCh38/GRCh38.IGIS4.0.genes.rds@REVISION-3", -field = "symbol" -) -) -found.genes <- found.symbols <- character(0) -for (x in gathered) { -current <- getFeatures(x$id) -found.genes <- c(found.genes, names(current)) -found.symbols <- c(found.symbols, mcols(current)[[x$field]]) -} -keep <- found.genes %in% u.genes & !is.na(found.symbols) -found.symbols <- c(u.genes, found.symbols[keep]) # get the Ensembl ID at the front. -found.genes <- c(u.genes, found.genes[keep]) -symbol.mapping <- split(found.symbols, factor(found.genes, levels=u.genes)) -dir <- "assets" -dir.create(dir) -saveTabbedIndices <- function(y, path) { -x <- vapply(y, function(z) { -z <- sort(z) # convert to diffs to reduce integer size -z <- c(z[1] - 1L, diff(z)) # get to 0-based indexing. 
-paste(z, collapse="\t") -}, "") -write(x, file=file.path(dir, path)) -handle <- gzfile(file.path(dir, paste0(path, ".ranges.gz"))) -write(nchar(x), file=handle, ncolumns=1) -close(handle) -} -saveTabbedIndices(by.gene, path="gene2set.tsv") -saveTabbedIndices(by.set, path="set2gene.tsv") -collected <- sprintf("%s\t%s\t%s\t%s\t%s", collections$id, collections$number, tolower(collections$title), gsub("\t|\n", " ", tolower(collections$description)), collections$species) -handle <- gzfile(file.path(dir, "collections.tsv.gz")) -write(collected, file=handle) -close(handle) -collected <- sprintf("%s\t%s\t%s", gsub("\t|\n", " ", tolower(descriptions$name)), gsub("\t|\n", " ", tolower(descriptions$description)), descriptions$size) -handle <- gzfile(file.path(dir, "sets.tsv.gz")) -write(collected, file=handle) -close(handle) -collected <- vapply(symbol.mapping, function(x) paste(gsub("\t|\n", " ", tolower(x)), collapse="\t"), "") -handle <- gzfile(file.path(dir, "genes.tsv.gz")) -write(collected, file=handle) -close(handle) -library(dsassembly) -dataset <- "DS000012156" -ds <- getDataset(dataset) -ds <- getDataset(dataset) -library(GenomeInfoDbData) -install.packages("BiocManager") -BiocManager::install("GenomeInfoDbData") -BiocManager::install("GenomeInfoDbData", version = 1.2.7) -BiocManager::install("GenomeInfoDbData", version = "1.2.7") -setwd("~/Projects/public/BiocPy/rds2py/tests/data") -setClass("FOO", slots=c(bar="integer")) -# pairlist -y <- pairlist(runif(10), runif(20), runif(30)) -saveRDS(y, file="pairlist.rds") -y <- pairlist(sample(letters), pairlist(sample(11), runif(12))) -saveRDS(y, file="pairlist_nested.rds") -y <- pairlist(foo=sample(letters), bar=pairlist(whee=sample(11), bum=runif(12))) # with names -saveRDS(y, file="pairlist_names.rds") -y <- pairlist(aaron=sample(letters), bar=list(sample(11), runif(12))) -attr(y, "foo") <- "bar" -saveRDS(y, file="pairlist_attr.rds") -# altrep -scenarios <- 1:15 -saveRDS(y, file="altrep_series.rds") -x <- 1:100 -names(x) <- sprintf("GENE_%s", seq_along(x)) -saveRDS(x, file="altrep_attr.rds") -x <- as.character(1:100) -saveRDS(x, file="altrep_strings_deferred.rds") -x <- c(NA_integer_, 1:10, NA_integer_) -x <- as.character(x) -saveRDS(x, file="altrep_strings_wNA.rds") -x <- as.character(1:100 * 2) -saveRDS(x, file="altrep_double_deferred.rds") -x <- c(NaN, 1:10, Inf, -Inf, NA) -x <- as.character(x) -saveRDS(x, file="altrep_double_wNA.rds") -# atomic -y <- rpois(112, lambda=8) -saveRDS(y, file="atomic_ints.rds") -y <- rbinom(55, 1, 0.5) == 0 -saveRDS(y, file="atomic_logical.rds") -y <- rbinom(999, 1, 0.5) == 0 -y[sample(length(y), 10)] <- NA -saveRDS(y, file="atomic_logical_wNA.rds") -y <- rnorm(99) -saveRDS(y, file="atomic_double.rds") -y <- as.raw(sample(256, 99, replace=TRUE) - 1) -saveRDS(y, file="atomic_raw.rds") -y <- rnorm(99) + rnorm(99) * 1i -saveRDS(y, file="atomic_complex.rds") -y <- sample(LETTERS) -saveRDS(y, file="atomic_chars.rds") -y <- c("α-globin", "😀😀😀", "fußball", "Hervé Pagès") -saveRDS(y, file="atomic_chars_unicode.rds") -vals <- sample(.Machine$integer.max, 1000) -names(vals) <- sprintf("GENE_%i", seq_along(vals)) -attr(vals, "foo") <- c("BAR", "bar", "Bar") -class(vals) <- "frog" -saveRDS(vals, file="atomic_attr.rds") -# lists -y <- list(runif(10), runif(20), runif(30)) -saveRDS(y, file="lists.rds") -y <- list(sample(letters), list(sample(11), runif(12))) -saveRDS(y, file="lists_nested.rds") -y <- list(list(2, 6), list(5, c("cat", "dog", "bouse"), list(sample(99), runif(20)))) -saveRDS(y, file="lists_nested_deep.rds") -df 
<- data.frame(xxx=runif(19), YYY=sample(letters, 19), ZZZ=rbinom(19, 1, 0.4) == 0) -saveRDS(df, file="lists_df.rds") -rownames(df) <- paste0("FOO-", LETTERS[1:19]) -saveRDS(df, file="lists_df_rownames.rds") -# S4 -y <- Matrix::rsparsematrix(100, 10, 0.05) -saveRDS(y, file="s4_matrix.rds") -setClass("FOO", slots=c(bar="integer")) -y <- new("FOO", bar=2L) -saveRDS(y, file="s4_class.rds") -?.row_data_path -??.row_data_path -showMethods(.row_data_path) -methods(".row_data_path") -methods(print) -methods(.row_data_path) diff --git a/tests/data/atomic_ints_with_names.rds b/tests/data/atomic_ints_with_names.rds new file mode 100644 index 0000000..de8b7a3 Binary files /dev/null and b/tests/data/atomic_ints_with_names.rds differ diff --git a/tests/data/data.frame.rds b/tests/data/data.frame.rds new file mode 100644 index 0000000..6649cf2 Binary files /dev/null and b/tests/data/data.frame.rds differ diff --git a/tests/data/generate_files.R b/tests/data/generate_files.R index 0aae82b..31166ce 100644 --- a/tests/data/generate_files.R +++ b/tests/data/generate_files.R @@ -70,6 +70,11 @@ attr(vals, "foo") <- c("BAR", "bar", "Bar") class(vals) <- "frog" saveRDS(vals, file="atomic_attr.rds") +# scalars + +y <- 10 +saveRDS(y, file="scalar_int.rds") + # lists y <- list(runif(10), runif(20), runif(30)) @@ -106,3 +111,69 @@ gr <- GRanges( GC = seq(1, 0, length=10)) saveRDS(gr, file="granges.rds") + +# factors + +f1 <- factor(c("chr1", "chr2", "chr1", "chr3")) +saveRDS(f1, "simple_factors.rds") + +# Rle +x2 <- Rle(LETTERS[c(21:26, 25:26)], 8:1) +saveRDS(x2, "simple_rle.rds") + + +# SummarizedExperiment + +nrows <- 200 +ncols <- 6 +counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows) +rowRanges <- GRanges(rep(c("chr1", "chr2"), c(50, 150)), + IRanges(floor(runif(200, 1e5, 1e6)), width=100), + strand=sample(c("+", "-"), 200, TRUE), + feature_id=sprintf("ID%03d", 1:200)) +rowd <- DataFrame(seqs = rep(c("chr1", "chr2"), c(50, 150))) +colData <- DataFrame(Treatment=rep(c("ChIP", "Input"), 3), + row.names=LETTERS[1:6]) + +se <- SummarizedExperiment(assays=list(counts=counts), + rowData = rowd, colData=colData) + +rse <- SummarizedExperiment(assays=list(counts=counts), + rowRanges = rowRanges, colData=colData) +saveRDS(se, "sumexpt.rds") +saveRDS(rse, "ranged_se.rds") + +# SingleCell Experiment + +library(scRNAseq) +sce <- ReprocessedAllenData("tophat_counts") +sce_subset <- sce[1:100, 1:100] +saveRDS(sce_subset, "simple_sce.rds") + +# lists + +x <- list(github = "jkanche", fullname=c("Kancherla", "Jayaram"), + collab=list(github = "ltla", fullname=c("Lun", "Aaron"))) +saveRDS(x, "simple_list.rds") + +# frames +dframe <- as.data.frame(lists_df) +saveRDS(dframe, "data.frame.rds") + +# MAE +library(MultiAssayExperiment) +patient.data <- data.frame(sex=c("M", "F", "M", "F"), + age=38:41, + row.names=c("Jack", "Jill", "Bob", "Barbara")) + +exprss1 <- matrix(rnorm(16), ncol = 4, + dimnames = list(sprintf("ENST00000%i", sample(288754:290000, 4)), + c("Jack", "Jill", "Bob", "Bobby"))) +exprss2 <- matrix(rnorm(12), ncol = 3, + dimnames = list(sprintf("ENST00000%i", sample(288754:290000, 4)), + c("Jack", "Jane", "Bob"))) +doubleExp <- list("methyl 2k" = exprss1, "methyl 3k" = exprss2) +simpleMultiAssay <- MultiAssayExperiment(experiments=doubleExp) +simpleMultiAssay2 <- MultiAssayExperiment(experiments=doubleExp, + colData=patient.data) +saveRDS(simpleMultiAssay2, "simple_mae.rds") diff --git a/tests/data/ranged_se.rds b/tests/data/ranged_se.rds new file mode 100644 index 0000000..badac61 Binary files /dev/null and 
b/tests/data/ranged_se.rds differ diff --git a/tests/data/scalar_int.rds b/tests/data/scalar_int.rds new file mode 100644 index 0000000..ad5b757 Binary files /dev/null and b/tests/data/scalar_int.rds differ diff --git a/tests/data/simple_factors.rds b/tests/data/simple_factors.rds new file mode 100644 index 0000000..99b00a8 Binary files /dev/null and b/tests/data/simple_factors.rds differ diff --git a/tests/data/simple_list.rds b/tests/data/simple_list.rds new file mode 100644 index 0000000..50771a1 Binary files /dev/null and b/tests/data/simple_list.rds differ diff --git a/tests/data/simple_mae.rds b/tests/data/simple_mae.rds new file mode 100644 index 0000000..8c9b0ec Binary files /dev/null and b/tests/data/simple_mae.rds differ diff --git a/tests/data/simple_rle.rds b/tests/data/simple_rle.rds new file mode 100644 index 0000000..b5fe2d7 Binary files /dev/null and b/tests/data/simple_rle.rds differ diff --git a/tests/data/simple_sce.rds b/tests/data/simple_sce.rds new file mode 100644 index 0000000..e6a015f Binary files /dev/null and b/tests/data/simple_sce.rds differ diff --git a/tests/data/sumexpt.rds b/tests/data/sumexpt.rds new file mode 100644 index 0000000..69f127d Binary files /dev/null and b/tests/data/sumexpt.rds differ diff --git a/tests/test_atomic-attr.py b/tests/test_atomic-attr.py deleted file mode 100644 index 95bb708..0000000 --- a/tests/test_atomic-attr.py +++ /dev/null @@ -1,18 +0,0 @@ -import pytest - -from rds2py.PyRdsReader import PyRdsParser - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def test_read_atomic_attrs(): - parsed_obj = PyRdsParser("tests/data/atomic_attr.rds") - data = parsed_obj.parse() - print(data) - - assert data is not None - assert len(data["data"]) > 0 - assert len(data["attributes"]) > 0 - assert len(data["attributes"]["names"]["data"]) == 1000 diff --git a/tests/test_atomic-bool.py b/tests/test_atomic-bool.py deleted file mode 100644 index e57fe27..0000000 --- a/tests/test_atomic-bool.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest - -from rds2py.PyRdsReader import PyRdsParser - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def test_read_atomic_logical(): - parsed_obj = PyRdsParser("tests/data/atomic_logical.rds") - array = parsed_obj.parse() - - assert array is not None - assert array["data"].shape[0] > 0 - - -def test_read_atomic_logical_na(): - parsed_obj = PyRdsParser("tests/data/atomic_logical_wNA.rds") - array = parsed_obj.parse() - - assert array is not None - assert array["data"].shape[0] > 0 diff --git a/tests/test_atomic-double.py b/tests/test_atomic-double.py deleted file mode 100644 index f92f620..0000000 --- a/tests/test_atomic-double.py +++ /dev/null @@ -1,16 +0,0 @@ -import pytest - -from rds2py.PyRdsReader import PyRdsParser - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def test_read_atomic_double(): - parsed_obj = PyRdsParser("tests/data/atomic_double.rds") - array = parsed_obj.parse() - - assert array is not None - print(array) - assert array["data"].shape[0] == 99 diff --git a/tests/test_atomic-int.py b/tests/test_atomic-int.py deleted file mode 100644 index c10c7a6..0000000 --- a/tests/test_atomic-int.py +++ /dev/null @@ -1,16 +0,0 @@ -import pytest - -from rds2py.PyRdsReader import PyRdsParser - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def test_read_atomic_ints(): - parsed_obj = PyRdsParser("tests/data/atomic_ints.rds") - array = parsed_obj.parse() - - assert array is not None - print(array) - 
assert array["data"].shape[0] == 112 diff --git a/tests/test_atomic-str.py b/tests/test_atomic-str.py deleted file mode 100644 index 0b4d062..0000000 --- a/tests/test_atomic-str.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest - -from rds2py.PyRdsReader import PyRdsParser - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def test_read_atomic_chars(): - parsed_obj = PyRdsParser("tests/data/atomic_chars.rds") - array = parsed_obj.parse() - - assert array is not None - assert len(array["data"]) == 26 - - -def test_read_atomic_chars_unicode(): - parsed_obj = PyRdsParser("tests/data/atomic_chars_unicode.rds") - array = parsed_obj.parse() - - assert array is not None - assert len(array["data"]) == 4 diff --git a/tests/test_atomics.py b/tests/test_atomics.py new file mode 100644 index 0000000..2c4dafa --- /dev/null +++ b/tests/test_atomics.py @@ -0,0 +1,103 @@ +import pytest + +from rds2py import read_rds + +from biocutils import BooleanList, FloatList, IntegerList, StringList + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + +## With attributes + + +def test_read_atomic_attrs(): + data = read_rds("tests/data/atomic_attr.rds") + + assert data is not None + assert isinstance(data, dict) + assert data["attributes"]["class"]["data"][0] == "frog" + + +## Booleans + + +def test_read_atomic_logical(): + arr = read_rds("tests/data/atomic_logical.rds") + + assert arr is not None + assert isinstance(arr, BooleanList) + assert len(arr) > 0 + + +def test_read_atomic_logical_na(): + arr = read_rds("tests/data/atomic_logical_wNA.rds") + + assert arr is not None + assert isinstance(arr, BooleanList) + assert len(arr) > 0 + + +## Doubles/Floats + + +def test_read_atomic_double(): + obj = read_rds("tests/data/atomic_double.rds") + + assert obj is not None + assert isinstance(obj, FloatList) + assert len(obj) == 99 + + +## Ints + + +def test_read_atomic_ints(): + arr = read_rds("tests/data/atomic_ints.rds") + + assert arr is not None + assert isinstance(arr, IntegerList) + assert len(arr) == 112 + assert arr.names is None + + +def test_read_atomic_ints_with_names(): + arr = read_rds("tests/data/atomic_ints_with_names.rds") + + assert arr is not None + assert isinstance(arr, IntegerList) + assert arr.names is not None + assert len(arr) == 112 + + +## Strings + + +def test_read_atomic_chars(): + arr = read_rds("tests/data/atomic_chars.rds") + + assert arr is not None + assert isinstance(arr, StringList) + assert len(arr) == 26 + assert arr.names is None + + +def test_read_atomic_chars_unicode(): + arr = read_rds("tests/data/atomic_chars_unicode.rds") + + assert arr is not None + assert isinstance(arr, StringList) + assert len(arr) == 4 + assert arr.names is None + + +## Test scalar values, defaults to a vector + + +def test_read_scalar_float(): + obj = read_rds("tests/data/scalar_int.rds") + + assert obj is not None + assert isinstance(obj, FloatList) + assert len(obj) == 1 + assert obj[0] == 10.0 diff --git a/tests/test_dict.py b/tests/test_dict.py new file mode 100644 index 0000000..52611f3 --- /dev/null +++ b/tests/test_dict.py @@ -0,0 +1,52 @@ +import pytest + +from rds2py import read_rds + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_read_simple_lists(): + obj = read_rds("tests/data/simple_list.rds") + + assert obj is not None + assert len(obj) > 0 + + assert "collab" in obj + assert len(obj["collab"]) > 0 + + +def test_read_atomic_lists(): + obj = read_rds("tests/data/lists.rds") + + assert obj is not None + assert 
len(obj) > 0 + + +def test_read_atomic_lists_nested(): + obj = read_rds("tests/data/lists_nested.rds") + + assert obj is not None + assert len(obj) > 0 + + +def test_read_atomic_lists_nested_deep(): + obj = read_rds("tests/data/lists_nested_deep.rds") + + assert obj is not None + assert len(obj) > 0 + + +def test_read_atomic_lists_df(): + obj = read_rds("tests/data/lists_df.rds") + + assert obj is not None + assert len(obj) > 0 + + +def test_read_atomic_lists_nested_deep_rownames(): + obj = read_rds("tests/data/lists_df_rownames.rds") + + assert obj is not None + assert len(obj) > 0 diff --git a/tests/test_factors.py b/tests/test_factors.py new file mode 100644 index 0000000..96471fc --- /dev/null +++ b/tests/test_factors.py @@ -0,0 +1,16 @@ +import pytest + +from rds2py import read_rds + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + +## With attributes + + +def test_read_simple_factors(): + data = read_rds("tests/data/simple_factors.rds") + + assert data is not None + assert len(data) == 4 diff --git a/tests/test_frames.py b/tests/test_frames.py new file mode 100644 index 0000000..449150f --- /dev/null +++ b/tests/test_frames.py @@ -0,0 +1,24 @@ +import pytest + +from rds2py import read_rds +from biocframe import BiocFrame + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_read_atomic_lists_df(): + frame = read_rds("tests/data/lists_df.rds") + + assert frame is not None + assert isinstance(frame, BiocFrame) + assert len(frame) > 0 + + +def test_read_atomic_lists_nested_deep_rownames(): + frame = read_rds("tests/data/lists_df_rownames.rds") + + assert frame is not None + assert isinstance(frame, BiocFrame) + assert len(frame) > 0 diff --git a/tests/test_granges.py b/tests/test_granges.py index f64cf8a..97ee606 100644 --- a/tests/test_granges.py +++ b/tests/test_granges.py @@ -1,9 +1,9 @@ import pytest -from rds2py.granges import as_granges, as_granges_list -from rds2py.parser import read_rds +from rds2py import read_rds from genomicranges import GenomicRanges, GenomicRangesList +import numpy as np __author__ = "jkanche" __copyright__ = "jkanche" @@ -11,17 +11,28 @@ def test_granges(): - robj = read_rds("tests/data/granges.rds") - - gr = as_granges(robj=robj) + gr = read_rds("tests/data/granges.rds") assert isinstance(gr, GenomicRanges) + assert gr.get_seqnames("list") == [ + "chr1", + "chr2", + "chr2", + "chr2", + "chr1", + "chr1", + "chr3", + "chr3", + "chr3", + "chr3", + ] + assert np.allclose(gr.get_start(), range(101, 111)) + assert len(gr.get_mcols().get_column_names()) == 2 + assert gr.get_strand("list") == ["-", "+", "+", "*", "*", "+", "+", "+", "-", "-"] def test_granges_list(): - robj = read_rds("tests/data/grangeslist.rds") - - gr = as_granges_list(robj=robj) + gr = read_rds("tests/data/grangeslist.rds") assert isinstance(gr, GenomicRangesList) assert len(gr) == 5 diff --git a/tests/test_interface_matrix.py b/tests/test_interface_matrix.py deleted file mode 100644 index e279419..0000000 --- a/tests/test_interface_matrix.py +++ /dev/null @@ -1,34 +0,0 @@ -import pytest - -from rds2py.interface import as_dense_matrix, as_sparse_matrix -from rds2py.parser import read_rds -import numpy as np -from scipy import sparse as sp - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def test_read_s4_matrix_dgc(): - parsed_obj = read_rds("tests/data/s4_matrix.rds") - array = as_sparse_matrix(parsed_obj) - - assert array is not None - assert isinstance(array, sp.spmatrix) - - -def 
test_read_s4_matrix_dgt(): - parsed_obj = read_rds("tests/data/s4_matrix_dgt.rds") - array = as_sparse_matrix(parsed_obj) - - assert array is not None - assert isinstance(array, sp.spmatrix) - - -def test_read_dense_numpy_dtype(): - parsed_obj = read_rds("tests/data/numpy_dtype.rds") - array = as_dense_matrix(parsed_obj) - - assert array is not None - assert isinstance(array, np.ndarray) diff --git a/tests/test_list.py b/tests/test_list.py deleted file mode 100644 index fa5b012..0000000 --- a/tests/test_list.py +++ /dev/null @@ -1,47 +0,0 @@ -import pytest - -from rds2py.PyRdsReader import PyRdsParser - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def test_read_atomic_lists(): - parsed_obj = PyRdsParser("tests/data/lists.rds") - array = parsed_obj.parse() - - assert array is not None - assert len(array) > 0 - - -def test_read_atomic_lists_nested(): - parsed_obj = PyRdsParser("tests/data/lists_nested.rds") - array = parsed_obj.parse() - - assert array is not None - assert len(array) > 0 - - -def test_read_atomic_lists_nested_deep(): - parsed_obj = PyRdsParser("tests/data/lists_nested_deep.rds") - array = parsed_obj.parse() - - assert array is not None - assert len(array) > 0 - - -def test_read_atomic_lists_df(): - parsed_obj = PyRdsParser("tests/data/lists_df.rds") - array = parsed_obj.parse() - - assert array is not None - assert len(array) > 0 - - -def test_read_atomic_lists_nested_deep_rownames(): - parsed_obj = PyRdsParser("tests/data/lists_df_rownames.rds") - array = parsed_obj.parse() - - assert array is not None - assert len(array) > 0 diff --git a/tests/test_mae.py b/tests/test_mae.py new file mode 100644 index 0000000..485c55a --- /dev/null +++ b/tests/test_mae.py @@ -0,0 +1,17 @@ +import pytest + +from rds2py import read_rds + +from multiassayexperiment import MultiAssayExperiment + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_read_sce(): + data = read_rds("tests/data/simple_mae.rds") + + assert data is not None + assert isinstance(data, MultiAssayExperiment) + assert len(data.get_experiment_names()) == 2 diff --git a/tests/test_matrices.py b/tests/test_matrices.py new file mode 100644 index 0000000..52d59b9 --- /dev/null +++ b/tests/test_matrices.py @@ -0,0 +1,35 @@ +import pytest + +from rds2py import read_rds +import numpy as np +from scipy import sparse as sp + +from rds2py.read_matrix import MatrixWrapper + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_read_s4_matrix_dgc(): + array = read_rds("tests/data/s4_matrix.rds") + + assert array is not None + assert isinstance(array, sp.spmatrix) + + +def test_read_s4_matrix_dgt(): + array = read_rds("tests/data/s4_matrix_dgt.rds") + + assert array is not None + assert isinstance(array, sp.spmatrix) + + +def test_read_dense_numpy_dtype(): + array = read_rds("tests/data/numpy_dtype.rds") + + assert array is not None + assert isinstance(array, MatrixWrapper) + assert isinstance(array.matrix, np.ndarray) + assert array.dimnames is not None + assert len(array.dimnames) == len(array.matrix.shape) diff --git a/tests/test_rle.py b/tests/test_rle.py new file mode 100644 index 0000000..71acc84 --- /dev/null +++ b/tests/test_rle.py @@ -0,0 +1,18 @@ +import pytest + +from rds2py import read_rds + +from biocutils import BooleanList, FloatList, IntegerList, StringList + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + +## With attributes + + +def test_read_simple_rle(): + data = 
read_rds("tests/data/simple_rle.rds") + + assert data is not None + assert len(data) == 36 diff --git a/tests/test_s4.py b/tests/test_s4.py index 1dd7bda..78fbfa2 100644 --- a/tests/test_s4.py +++ b/tests/test_s4.py @@ -1,10 +1,10 @@ -import pytest +# import pytest from rds2py.PyRdsReader import PyRdsParser -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" +# __author__ = "jkanche" +# __copyright__ = "jkanche" +# __license__ = "MIT" def test_read_s4_class(): diff --git a/tests/test_sce.py b/tests/test_sce.py new file mode 100644 index 0000000..6edaa0a --- /dev/null +++ b/tests/test_sce.py @@ -0,0 +1,17 @@ +import pytest + +from rds2py import read_rds + +from singlecellexperiment import SingleCellExperiment + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_read_sce(): + data = read_rds("tests/data/simple_sce.rds") + + assert data is not None + assert isinstance(data, SingleCellExperiment) + assert data.shape == (100, 100) diff --git a/tests/test_se.py b/tests/test_se.py new file mode 100644 index 0000000..8ceb6b5 --- /dev/null +++ b/tests/test_se.py @@ -0,0 +1,25 @@ +import pytest + +from rds2py import read_rds + +from summarizedexperiment import SummarizedExperiment, RangedSummarizedExperiment + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_read_summ_expt(): + data = read_rds("tests/data/sumexpt.rds") + + assert data is not None + assert isinstance(data, SummarizedExperiment) + assert data.shape == (200, 6) + + +def test_read_ranged_summ_expt(): + data = read_rds("tests/data/ranged_se.rds") + + assert data is not None + assert isinstance(data, RangedSummarizedExperiment) + assert data.shape == (200, 6)