diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 3c148a9..a65f9cb 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -31,6 +31,6 @@ set_property(TARGET ${TARGET} PROPERTY CXX_STANDARD 17) target_link_libraries(${TARGET} PRIVATE rds2cpp pybind11::pybind11) set_target_properties(${TARGET} PROPERTIES - OUTPUT_NAME lib_rds + OUTPUT_NAME rds_parser PREFIX "" ) diff --git a/lib/src/rdswrapper.cpp b/lib/src/rdswrapper.cpp index 988882a..647e987 100644 --- a/lib/src/rdswrapper.cpp +++ b/lib/src/rdswrapper.cpp @@ -2,453 +2,227 @@ #include #include #include "rds2cpp/rds2cpp.hpp" -#include namespace py = pybind11; -// Interface methods to Parser Object - -inline uintptr_t py_parser_rds_file(std::string file) { - rds2cpp::Parsed res = rds2cpp::parse_rds(file); - - return reinterpret_cast(new rds2cpp::Parsed(std::move(res))); -} - -inline uintptr_t py_parser_extract_robject(uintptr_t ptr) { - auto parsed = reinterpret_cast(ptr); - return reinterpret_cast(parsed->object.get()); -} - -// probably don't need this, mostly for testing -inline void py_read_parsed_ptr(uintptr_t ptr) { - auto parsed = reinterpret_cast(ptr); -} - -// Interface Methods to RObject - -inline std::string py_robject_extract_type(uintptr_t ptr) { - auto parsed = reinterpret_cast(ptr); - switch (parsed->type()) { - case rds2cpp::SEXPType::INT: - return "integer"; - case rds2cpp::SEXPType::REAL: - return "double"; - case rds2cpp::SEXPType::STR: - return "string"; - case rds2cpp::SEXPType::LGL: - return "boolean"; - case rds2cpp::SEXPType::VEC: - return "vector"; - case rds2cpp::SEXPType::S4: - return "S4"; - case rds2cpp::SEXPType::NIL: - return "null"; - default: - break; - } - return "other"; -} - -template -int _size_(const rds2cpp::RObject* ptr) { - auto xptr = static_cast(ptr); - return xptr->data.size(); -} - -inline int py_robject_extract_size(uintptr_t ptr) { - auto parsed = reinterpret_cast(ptr); - switch (parsed->type()) { - case rds2cpp::SEXPType::INT: - return _size_(parsed); - case rds2cpp::SEXPType::REAL: - return _size_(parsed); - case rds2cpp::SEXPType::STR: - return _size_(parsed); - case rds2cpp::SEXPType::LGL: - return _size_(parsed); - case rds2cpp::SEXPType::VEC: - return _size_(parsed); - default: - break; - } - return -1; -} - -template -uintptr_t _get_vector_ptr(const rds2cpp::RObject* ptr) { - auto xptr = static_cast(ptr); - return reinterpret_cast(xptr->data.data()); -} - -inline uintptr_t parse_robject_int_vector(uintptr_t ptr) { - auto parsed = reinterpret_cast(ptr); - switch (parsed->type()) { - case rds2cpp::SEXPType::INT: - return _get_vector_ptr(parsed); - case rds2cpp::SEXPType::LGL: - return _get_vector_ptr(parsed); - case rds2cpp::SEXPType::REAL: - return _get_vector_ptr(parsed); - default: - break; - } - throw std::runtime_error("cannot obtain numeric values for non-numeric RObject type"); - return _get_vector_ptr(parsed); // avoid compiler warning. -} - -// inline uintptr_t parse_robject_double_vector(uintptr_t ptr) { -// auto parsed = reinterpret_cast(ptr); -// switch (parsed->type()) { -// case rds2cpp::SEXPType::REAL: -// return _get_vector_ptr(parsed); -// default: -// break; -// } -// throw std::runtime_error("cannot obtain numeric values for non-numeric RObject type"); -// return _get_vector_ptr(parsed); // avoid compiler warning. -// } - -inline std::vector parse_robject_string_vector(uintptr_t ptr) { - auto parsed = reinterpret_cast(ptr); - if (parsed->type() != rds2cpp::SEXPType::STR) { - throw std::runtime_error("cannot return string values for non-string RObject type"); - } - auto sptr = static_cast(parsed); - - return sptr->data; -} - -template -const rds2cpp::Attributes& _get_attr_ptr(const rds2cpp::RObject* ptr) { - auto aptr = static_cast(ptr); - return aptr->attributes; -} - -inline std::vector parse_robject_attribute_names(uintptr_t ptr) { - auto parsed = reinterpret_cast(ptr); - switch (parsed->type()) { - case rds2cpp::SEXPType::INT: - return _get_attr_ptr(parsed).names; - break; - case rds2cpp::SEXPType::REAL: - return _get_attr_ptr(parsed).names; - break; - case rds2cpp::SEXPType::LGL: - return _get_attr_ptr(parsed).names; - break; - case rds2cpp::SEXPType::VEC: - return _get_attr_ptr(parsed).names; - break; - case rds2cpp::SEXPType::S4: - return _get_attr_ptr(parsed).names; - break; - default: - break; - } - return _get_attr_ptr(parsed).names; // avoid compiler warning. -} - -template -int _contains_attr_(const rds2cpp::RObject* ptr, const std::string& name) { - auto aptr = static_cast(ptr); - const auto& attr_names = aptr->attributes.names; - - for (size_t i = 0; i < attr_names.size(); ++i) { - if (attr_names[i] == name) { - return i; - } - } - - return -1; -} - -inline int parse_robject_find_attribute(uintptr_t ptr, std::string name) { - auto parsed = reinterpret_cast(ptr); - switch (parsed->type()) { - case rds2cpp::SEXPType::INT: - return _contains_attr_(parsed, name); - case rds2cpp::SEXPType::REAL: - return _contains_attr_(parsed, name); - case rds2cpp::SEXPType::LGL: - return _contains_attr_(parsed, name); - case rds2cpp::SEXPType::STR: - return _contains_attr_(parsed, name); - case rds2cpp::SEXPType::VEC: - return _contains_attr_(parsed, name); - case rds2cpp::SEXPType::S4: - return _contains_attr_(parsed, name); - default: - break; - } - return -1; -} - -template -uintptr_t _load_attr_idx_(const rds2cpp::RObject* ptr, int i) { - auto aptr = static_cast(ptr); - if (static_cast(i) >= aptr->attributes.values.size()) { - throw std::runtime_error("requested attribute index " + std::to_string(i) + " is out of range"); - } - const auto& chosen = aptr->attributes.values[i]; - return reinterpret_cast(chosen.get()); -} - -inline uintptr_t parse_robject_load_attribute_by_index(uintptr_t ptr, int i) { - auto parsed = reinterpret_cast(ptr); - switch (parsed->type()) { - case rds2cpp::SEXPType::INT: - return _load_attr_idx_(parsed, i); - case rds2cpp::SEXPType::REAL: - return _load_attr_idx_(parsed, i); - case rds2cpp::SEXPType::LGL: - return _load_attr_idx_(parsed, i); - case rds2cpp::SEXPType::STR: - return _load_attr_idx_(parsed, i); - case rds2cpp::SEXPType::VEC: - return _load_attr_idx_(parsed, i); - case rds2cpp::SEXPType::S4: - return _load_attr_idx_(parsed, i); - default: - break; - } - - throw std::runtime_error("unsupported R object type"); - return _load_attr_idx_(parsed, i); // avoid compiler warnings. -} - -inline uintptr_t parse_robject_load_attribute_by_name(uintptr_t ptr, std::string name) { - auto parsed = reinterpret_cast(ptr); - int idx = parse_robject_find_attribute(ptr, name); - if (idx < 0) { - throw std::runtime_error("no attribute named '" + name + "'"); - } - return parse_robject_load_attribute_by_index(ptr, idx); -} - -inline uintptr_t parse_robject_load_vec_element(uintptr_t ptr, int i) { - auto parsed = reinterpret_cast(ptr); - if (parsed->type() != rds2cpp::SEXPType::VEC) { - throw std::runtime_error("cannot return list element for non-list R object"); - } - auto lptr = static_cast(parsed); - return reinterpret_cast(lptr->data[i].get()); -} - -inline std::string parse_robject_class_name(uintptr_t ptr) { - auto parsed = reinterpret_cast(ptr); - if (parsed->type() != rds2cpp::SEXPType::S4) { - throw std::runtime_error("cannot return class name for non-S4 R object"); - } - auto sptr = static_cast(parsed); - return sptr->class_name; -} - -inline std::string parse_robject_package_name(uintptr_t ptr) { - auto parsed = reinterpret_cast(ptr); - if (parsed->type() != rds2cpp::SEXPType::S4) { - throw std::runtime_error("cannot return class name for non-S4 R object"); - } - auto sptr = static_cast(parsed); - return sptr->package_name; -} - -inline std::pair parse_robject_dimensions(uintptr_t ptr) { - auto dimobj = reinterpret_cast(ptr); - if (dimobj->type() != rds2cpp::SEXPType::INT) { - throw std::runtime_error("expected matrix dimensions to be integer"); - } - - auto dimvec = static_cast(dimobj); - const auto& dims = dimvec->data; - if (dims.size() != 2) { - throw std::runtime_error("expected matrix dimensions to be of length 2"); - } - if (dims[0] < 0 || dims[1] < 0) { - throw std::runtime_error("expected all matrix dimensions to be non-negative"); - } - - return std::pair(dims[0], dims[1]); -} - -// Class definitions - -class PyRdsReader { +class RdsObject { private: - const rds2cpp::RObject* ptr; - std::string rtype; - int rsize; + std::unique_ptr ptr; public: - static constexpr int R_MIN = -2147483648; - - PyRdsReader(const rds2cpp::RObject* p) : ptr(p) { - get_rtype(); - get_rsize(); - } - - std::string get_rtype() { - if (rtype.empty()) { - rtype = py_robject_extract_type(reinterpret_cast(ptr)); + RdsObject(const rds2cpp::RObject* p) : ptr(p) { + if (!ptr) { + throw std::runtime_error("Null pointer passed to RdsObject"); } - return rtype; } - int get_rsize() { - if (rsize == 0) { - rsize = py_robject_extract_size(reinterpret_cast(ptr)); + std::string get_type() const { + if (!ptr) { + throw std::runtime_error("Null pointer in get_type"); + } + switch (ptr->type()) { + case rds2cpp::SEXPType::INT: return "integer"; + case rds2cpp::SEXPType::REAL: return "double"; + case rds2cpp::SEXPType::STR: return "string"; + case rds2cpp::SEXPType::LGL: return "boolean"; + case rds2cpp::SEXPType::VEC: return "vector"; + case rds2cpp::SEXPType::S4: return "S4"; + case rds2cpp::SEXPType::NIL: return "null"; + default: return "other"; } - return rsize; } - py::object realize_value() { - py::dict result; - result["rtype"] = rtype; - - if (rtype == "integer" || rtype == "boolean") { - result["data"] = _get_int_or_bool_arr(); - result["attributes"] = realize_attr_value(); - } else if (rtype == "double") { - result["data"] = _get_double_arr(); - result["attributes"] = realize_attr_value(); - } else if (rtype == "string") { - result["data"] = _get_string_arr(); - } else if (rtype == "vector") { - result["data"] = _get_vector_arr(); - result["attributes"] = realize_attr_value(); - } else if (rtype == "null") { - return result; - } else if (rtype == "S4") { - result["package_name"] = get_package_name(); - result["class_name"] = get_class_name(); - result["attributes"] = realize_attr_value(); - return result; - } else { - throw std::runtime_error("Cannot realize object of type: " + rtype); + int get_size() const { + if (!ptr) { + throw std::runtime_error("Null pointer in get_size"); + } + switch (ptr->type()) { + case rds2cpp::SEXPType::INT: + return static_cast(ptr.get())->data.size(); + case rds2cpp::SEXPType::REAL: + return static_cast(ptr.get())->data.size(); + case rds2cpp::SEXPType::STR: + return static_cast(ptr.get())->data.size(); + case rds2cpp::SEXPType::LGL: + return static_cast(ptr.get())->data.size(); + case rds2cpp::SEXPType::VEC: + return static_cast(ptr.get())->data.size(); + default: + return -1; } - - return shennanigans_to_py_reprs(result); } - py::object shennanigans_to_py_reprs(py::dict result) { - if (rtype == "integer") { - py::array_t data = result["data"].cast>(); - if (rsize == 2 && data.at(0) == R_MIN && data.at(1) < 0) { - // Create a Python range object manually - py::object range_func = py::module::import("builtins").attr("range"); - result["data"] = range_func(data.at(1) * -1); + py::array get_numeric_data() const { + if (!ptr) { + throw std::runtime_error("Null pointer in get_numeric_data"); + } + switch (ptr->type()) { + case rds2cpp::SEXPType::INT: { + auto vec = static_cast(ptr.get()); + return py::array_t({vec->data.size()}, {sizeof(int)}, vec->data.data()); + } + case rds2cpp::SEXPType::REAL: { + auto vec = static_cast(ptr.get()); + return py::array_t({vec->data.size()}, {sizeof(double)}, vec->data.data()); } + case rds2cpp::SEXPType::LGL: { + auto vec = static_cast(ptr.get()); + return py::array_t({vec->data.size()}, {sizeof(int)}, vec->data.data()); + } + default: + throw std::runtime_error("Cannot get numeric data from non-numeric type"); } - return result; } - py::array _get_int_or_bool_arr() { - if (rsize == 0) { - return py::array_t(); + std::vector get_string_data() const { + if (!ptr) { + throw std::runtime_error("Null pointer in get_string_data"); } - uintptr_t arr_ptr = parse_robject_int_vector(reinterpret_cast(ptr)); - return py::array_t({rsize}, {sizeof(int)}, reinterpret_cast(arr_ptr)); - } - - py::array _get_double_arr() { - if (rsize == 0) { - return py::array_t(); + if (ptr->type() != rds2cpp::SEXPType::STR) { + throw std::runtime_error("Cannot get string data from non-string type"); } - uintptr_t arr_ptr = parse_robject_int_vector(reinterpret_cast(ptr)); - return py::array_t({rsize}, {sizeof(double)}, reinterpret_cast(arr_ptr)); - } - - py::list _get_string_arr() { - std::vector arr_str = parse_robject_string_vector(reinterpret_cast(ptr)); - return py::cast(arr_str); - } - - py::list _get_vector_arr() { - py::list vec; - for (int i = 0; i < rsize; ++i) { - uintptr_t elem_ptr = parse_robject_load_vec_element(reinterpret_cast(ptr), i); - PyRdsReader elem_reader(reinterpret_cast(elem_ptr)); - vec.append(elem_reader.realize_value()); + return static_cast(ptr.get())->data; + } + + std::vector get_attribute_names() const { + switch (ptr->type()) { + case rds2cpp::SEXPType::INT: + return static_cast(ptr.get())->attributes.names; + case rds2cpp::SEXPType::REAL: + return static_cast(ptr.get())->attributes.names; + case rds2cpp::SEXPType::LGL: + return static_cast(ptr.get())->attributes.names; + case rds2cpp::SEXPType::VEC: + return static_cast(ptr.get())->attributes.names; + case rds2cpp::SEXPType::S4: + return static_cast(ptr.get())->attributes.names; + default: + return std::vector(); } - return vec; - } - - py::list get_attribute_names() { - return py::cast(parse_robject_attribute_names(reinterpret_cast(ptr))); - } - - int find_attribute(const std::string& name) { - return parse_robject_find_attribute(reinterpret_cast(ptr), name); } - PyRdsReader load_attribute_by_index(int index) { - uintptr_t tmp = parse_robject_load_attribute_by_index(reinterpret_cast(ptr), index); - return PyRdsReader(reinterpret_cast(tmp)); - } - - PyRdsReader load_attribute_by_name(const std::string& name) { - uintptr_t tmp = parse_robject_load_attribute_by_name(reinterpret_cast(ptr), name); - return PyRdsReader(reinterpret_cast(tmp)); - } - - PyRdsReader load_vec_element(int i) { - uintptr_t tmp = parse_robject_load_vec_element(reinterpret_cast(ptr), i); - return PyRdsReader(reinterpret_cast(tmp)); + RdsObject get_attribute(const std::string& name) const { + if (!ptr) { + throw std::runtime_error("Null pointer in get_attribute"); + } + const auto& names = get_attribute_names(); + auto it = std::find(names.begin(), names.end(), name); + if (it == names.end()) { + throw std::runtime_error("Attribute not found: " + name); + } + size_t idx = std::distance(names.begin(), it); + + const rds2cpp::RObject* attr_ptr = nullptr; + switch (ptr->type()) { + case rds2cpp::SEXPType::INT: + attr_ptr = static_cast(ptr.get())->attributes.values[idx].get(); + break; + case rds2cpp::SEXPType::REAL: + attr_ptr = static_cast(ptr.get())->attributes.values[idx].get(); + break; + case rds2cpp::SEXPType::LGL: + attr_ptr = static_cast(ptr.get())->attributes.values[idx].get(); + break; + case rds2cpp::SEXPType::VEC: + attr_ptr = static_cast(ptr.get())->attributes.values[idx].get(); + break; + case rds2cpp::SEXPType::S4: + attr_ptr = static_cast(ptr.get())->attributes.values[idx].get(); + break; + default: + throw std::runtime_error("Cannot get attributes from this type"); + } + return RdsObject(attr_ptr); } - std::string get_package_name() { - if (rtype != "S4") { - throw std::runtime_error("package name does not exist on non-S4 classes"); + RdsObject get_vector_element(size_t idx) const { + if (!ptr) { + throw std::runtime_error("Null pointer in get_vector_element"); + } + if (ptr->type() != rds2cpp::SEXPType::VEC) { + throw std::runtime_error("Cannot get vector element from non-vector type"); + } + auto vec = static_cast(ptr.get()); + if (idx >= vec->data.size()) { + throw std::runtime_error("Index out of bounds"); } - return parse_robject_package_name(reinterpret_cast(ptr)); + return RdsObject(vec->data[idx].get()); } - std::string get_class_name() { - return parse_robject_class_name(reinterpret_cast(ptr)); + std::string get_class_name() const { + if (!ptr) { + throw std::runtime_error("Null pointer in get_class_name"); + } + if (ptr->type() != rds2cpp::SEXPType::S4) { + throw std::runtime_error("Cannot get class name from non-S4 type"); + } + return static_cast(ptr.get())->class_name; } - std::pair get_dimensions() { - return parse_robject_dimensions(reinterpret_cast(ptr)); + std::string get_package_name() const { + if (!ptr) { + throw std::runtime_error("Null pointer in get_package_name"); + } + if (ptr->type() != rds2cpp::SEXPType::S4) { + throw std::runtime_error("Cannot get package name from non-S4 type"); + } + return static_cast(ptr.get())->package_name; } - py::dict realize_attr_value() { - py::dict result; - for (const auto& ro_attr : get_attribute_names()) { - PyRdsReader tmp_obj = load_attribute_by_name(ro_attr.cast()); - result[ro_attr] = tmp_obj.realize_value(); + std::pair get_dimensions() const { + if (!ptr) { + throw std::runtime_error("Null pointer in get_dimensions"); + } + if (ptr->type() != rds2cpp::SEXPType::INT) { + throw std::runtime_error("Cannot get dimensions from non-integer type"); } - return result; + auto vec = static_cast(ptr.get()); + if (vec->data.size() != 2) { + throw std::runtime_error("Dimensions must be length 2"); + } + return {static_cast(vec->data[0]), static_cast(vec->data[1])}; } }; -class PyRdsObject { +class RdsParser { private: std::unique_ptr parsed; public: - PyRdsObject(const std::string& file) : parsed(std::make_unique(rds2cpp::parse_rds(file))) {} + RdsParser(const std::string& filename) { + try { + parsed = std::make_unique(rds2cpp::parse_rds(filename)); + } catch (const std::exception& e) { + throw std::runtime_error("Failed to parse RDS file: " + std::string(e.what())); + } + } - PyRdsReader get_robject() { - return PyRdsReader(parsed->object.get()); + RdsObject get_object() const { + if (!parsed || !parsed->object) { + throw std::runtime_error("No valid RDS object available"); + } + return RdsObject(parsed->object.get()); } }; -PYBIND11_MODULE(lib_rds, m) { - py::class_(m, "PyRdsObject") - .def(py::init()) - .def("get_robject", &PyRdsObject::get_robject); +PYBIND11_MODULE(rds_parser, m) { + m.doc() = "Python bindings for rds2cpp library"; + + py::register_exception(m, "RdsParserError"); - py::class_(m, "PyRdsReader") - .def(py::init()) - .def("get_rtype", &PyRdsReader::get_rtype) - .def("get_rsize", &PyRdsReader::get_rsize) - .def("realize_value", &PyRdsReader::realize_value) - .def("get_attribute_names", &PyRdsReader::get_attribute_names) - .def("find_attribute", &PyRdsReader::find_attribute) - .def("load_attribute_by_index", &PyRdsReader::load_attribute_by_index) - .def("load_attribute_by_name", &PyRdsReader::load_attribute_by_name) - .def("load_vec_element", &PyRdsReader::load_vec_element) - .def("get_package_name", &PyRdsReader::get_package_name) - .def("get_class_name", &PyRdsReader::get_class_name) - .def("get_dimensions", &PyRdsReader::get_dimensions) - .def("realize_attr_value", &PyRdsReader::realize_attr_value); + py::class_(m, "RdsObject") + .def("get_type", &RdsObject::get_type) + .def("get_size", &RdsObject::get_size) + .def("get_numeric_data", &RdsObject::get_numeric_data) + .def("get_string_data", &RdsObject::get_string_data) + .def("get_attribute_names", &RdsObject::get_attribute_names) + .def("get_attribute", &RdsObject::get_attribute) + .def("get_vector_element", &RdsObject::get_vector_element) + .def("get_class_name", &RdsObject::get_class_name) + .def("get_package_name", &RdsObject::get_package_name) + .def("get_dimensions", &RdsObject::get_dimensions); + + py::class_(m, "RdsParser") + .def(py::init()) + .def("get_object", &RdsParser::get_object); } \ No newline at end of file diff --git a/src/rds2py/PyRdsReader.py b/src/rds2py/PyRdsReader.py new file mode 100644 index 0000000..b29c12d --- /dev/null +++ b/src/rds2py/PyRdsReader.py @@ -0,0 +1,148 @@ +from typing import Dict, List, Union, Tuple +import numpy as np +from .rds_parser import RdsParser, RdsObject, RdsParserError + +class PyRdsValue: + """Python wrapper for RDS values""" + def __init__(self, obj: RdsObject): + self.obj = obj + try: + self._type = obj.get_type() + self._size = obj.get_size() + except RdsParserError as e: + raise ValueError(f"Failed to initialize PyRdsValue: {str(e)}") + + def realize_value(self) -> Dict: + """Convert the RDS object into a Python dictionary representation""" + try: + result = {"rtype": self._type} + + if self._type in ["integer", "boolean", "double"]: + result["data"] = self._get_numeric_data() + result["attributes"] = self._get_attributes() + result["class_name"] = f"{self._type}_vector" + + elif self._type == "string": + result["data"] = self.obj.get_string_data() + result["class_name"] = "string_vector" + + elif self._type == "vector": + result["data"] = self._get_vector_data() + result["attributes"] = self._get_attributes() + result["class_name"] = "vector" + + elif self._type == "S4": + result["package_name"] = self.obj.get_package_name() + result["class_name"] = self.obj.get_class_name() + result["attributes"] = self._get_attributes() + + elif self._type == "null": + pass + + else: + raise ValueError(f"Unsupported R object type: {self._type}") + + return self._handle_special_cases(result) + except RdsParserError as e: + raise ValueError(f"Failed to realize value: {str(e)}") + + def _get_numeric_data(self) -> np.ndarray: + """Get numeric data from the RDS object""" + try: + return self.obj.get_numeric_data() + except RdsParserError as e: + raise ValueError(f"Failed to get numeric data: {str(e)}") + + def _get_vector_data(self) -> List: + """Get vector data from the RDS object""" + try: + return [PyRdsValue(self.obj.get_vector_element(i)).realize_value() + for i in range(self._size)] + except RdsParserError as e: + raise ValueError(f"Failed to get vector data: {str(e)}") + + def _get_attributes(self) -> Dict: + """Get attributes from the RDS object""" + try: + result = {} + for name in self.obj.get_attribute_names(): + attr_obj = self.obj.get_attribute(name) + result[name] = PyRdsValue(attr_obj).realize_value() + return result + except RdsParserError as e: + raise ValueError(f"Failed to get attributes: {str(e)}") + + def _handle_special_cases(self, result: Dict) -> Dict: + """Handle special cases like R's NA values and ranges""" + if self._type == "integer" and self._size == 2: + data = result.get("data") + if data is not None and len(data) == 2: + if data[0] == -2147483648 and data[1] < 0: # R's NA value + result["data"] = range(-data[1]) + return result + +def get_dimensions(self) -> Tuple[int, int]: + """Get dimensions of the RDS object""" + try: + return self.obj.get_dimensions() + except RdsParserError as e: + raise ValueError(f"Failed to get dimensions: {str(e)}") + +class PyRdsReader: + """Main class for reading RDS files""" + def __init__(self, filename: str): + try: + self.parser = RdsParser(filename) + self.root_object = self.parser.get_object() + except RdsParserError as e: + raise IOError(f"Failed to initialize RDS parser: {str(e)}") + + def read(self) -> Dict: + """Read and parse the RDS file""" + try: + return PyRdsValue(self.root_object).realize_value() + except ValueError as e: + raise IOError(f"Failed to read RDS file: {str(e)}") + + def get_type(self) -> str: + """Get the type of the root RDS object""" + try: + return self.root_object.get_type() + except RdsParserError as e: + raise ValueError(f"Failed to get root object type: {str(e)}") + + def get_size(self) -> int: + """Get the size of the root RDS object""" + try: + return self.root_object.get_size() + except RdsParserError as e: + raise ValueError(f"Failed to get root object size: {str(e)}") + + def get_attribute_names(self) -> List[str]: + """Get attribute names of the root RDS object""" + try: + return self.root_object.get_attribute_names() + except RdsParserError as e: + raise ValueError(f"Failed to get root object attribute names: {str(e)}") + + def get_attribute(self, name: str) -> Dict: + """Get a specific attribute of the root RDS object""" + try: + attr_obj = self.root_object.get_attribute(name) + return PyRdsValue(attr_obj).realize_value() + except RdsParserError as e: + raise ValueError(f"Failed to get attribute '{name}': {str(e)}") + + def get_class_name(self) -> str: + """Get the class name of the root RDS object (for S4 objects)""" + try: + return self.root_object.get_class_name() + except RdsParserError as e: + raise ValueError(f"Failed to get root object class name: {str(e)}") + + def get_package_name(self) -> str: + """Get the package name of the root RDS object (for S4 objects)""" + try: + return self.root_object.get_package_name() + except RdsParserError as e: + raise ValueError(f"Failed to get root object package name: {str(e)}") diff --git a/src/rds2py/parser.py b/src/rds2py/parser.py index 459e64d..ac5621e 100644 --- a/src/rds2py/parser.py +++ b/src/rds2py/parser.py @@ -1,6 +1,6 @@ from typing import Dict, MutableMapping -from . import lib_rds as lib +from .PyRdsReader import PyRdsReader __author__ = "jkanche" __copyright__ = "jkanche" @@ -16,9 +16,8 @@ def read_rds(file: str) -> Dict: Returns: MutableMapping: R object as a python dictionary. """ - parsed_obj = lib.PyRdsObject(file) - robject_obj = parsed_obj.get_robject() - realized = robject_obj.realize_value() + parsed_obj = PyRdsReader(file) + realized = parsed_obj.read() return realized diff --git a/tests/test_atomic-attr.py b/tests/test_atomic-attr.py index c15b536..50d83fa 100644 --- a/tests/test_atomic-attr.py +++ b/tests/test_atomic-attr.py @@ -1,6 +1,6 @@ import pytest -from rds2py.lib_rds import PyRdsObject +from rds2py.PyRdsReader import PyRdsReader __author__ = "jkanche" __copyright__ = "jkanche" @@ -8,13 +8,10 @@ def test_read_atomic_attrs(): - parsed_obj = PyRdsObject("tests/data/atomic_attr.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() - attr_names = robject_obj.get_attribute_names() - attr_values = robject_obj.realize_attr_value() + parsed_obj = PyRdsReader("tests/data/atomic_attr.rds") + data = parsed_obj.read() - assert array is not None - assert len(array) > 0 - assert len(attr_names) is not None - assert len(attr_values) is not None + assert data is not None + assert len(data["data"]) > 0 + assert len(data["attributes"]) >0 + assert len(data["attributes"]["names"]["data"]) >0 diff --git a/tests/test_atomic-bool.py b/tests/test_atomic-bool.py index 1dcd029..e84182a 100644 --- a/tests/test_atomic-bool.py +++ b/tests/test_atomic-bool.py @@ -1,6 +1,6 @@ import pytest -from rds2py.lib_rds import PyRdsObject +from rds2py.PyRdsReader import PyRdsReader __author__ = "jkanche" __copyright__ = "jkanche" @@ -8,18 +8,16 @@ def test_read_atomic_logical(): - parsed_obj = PyRdsObject("tests/data/atomic_logical.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + parsed_obj = PyRdsReader("tests/data/atomic_logical.rds") + array = parsed_obj.read() assert array is not None assert array["data"].shape[0] > 0 def test_read_atomic_logical_na(): - parsed_obj = PyRdsObject("tests/data/atomic_logical_wNA.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + parsed_obj = PyRdsReader("tests/data/atomic_logical_wNA.rds") + array = parsed_obj.read() assert array is not None assert array["data"].shape[0] > 0 diff --git a/tests/test_atomic-double.py b/tests/test_atomic-double.py index 75a5fe3..28a65cb 100644 --- a/tests/test_atomic-double.py +++ b/tests/test_atomic-double.py @@ -1,6 +1,6 @@ import pytest -from rds2py.lib_rds import PyRdsObject +from rds2py.PyRdsReader import PyRdsReader __author__ = "jkanche" __copyright__ = "jkanche" @@ -8,9 +8,8 @@ def test_read_atomic_double(): - parsed_obj = PyRdsObject("tests/data/atomic_double.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + parsed_obj = PyRdsReader("tests/data/atomic_double.rds") + array = parsed_obj.read() assert array is not None print(array) diff --git a/tests/test_atomic-int.py b/tests/test_atomic-int.py index 2a47815..f232bee 100644 --- a/tests/test_atomic-int.py +++ b/tests/test_atomic-int.py @@ -1,6 +1,6 @@ import pytest -from rds2py.lib_rds import PyRdsObject +from rds2py.PyRdsReader import PyRdsReader __author__ = "jkanche" __copyright__ = "jkanche" @@ -8,9 +8,8 @@ def test_read_atomic_ints(): - parsed_obj = PyRdsObject("tests/data/atomic_ints.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + parsed_obj = PyRdsReader("tests/data/atomic_ints.rds") + array = parsed_obj.read() assert array is not None print(array) diff --git a/tests/test_atomic-str.py b/tests/test_atomic-str.py index d6281e8..3c0e84f 100644 --- a/tests/test_atomic-str.py +++ b/tests/test_atomic-str.py @@ -1,6 +1,6 @@ import pytest -from rds2py.lib_rds import PyRdsObject +from rds2py.PyRdsReader import PyRdsReader __author__ = "jkanche" __copyright__ = "jkanche" @@ -8,18 +8,16 @@ def test_read_atomic_chars(): - parsed_obj = PyRdsObject("tests/data/atomic_chars.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + parsed_obj = PyRdsReader("tests/data/atomic_chars.rds") + array = parsed_obj.read() assert array is not None assert len(array["data"]) == 26 def test_read_atomic_chars_unicode(): - parsed_obj = PyRdsObject("tests/data/atomic_chars_unicode.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + parsed_obj = PyRdsReader("tests/data/atomic_chars_unicode.rds") + array = parsed_obj.read() assert array is not None assert len(array["data"]) == 4 diff --git a/tests/test_list.py b/tests/test_list.py index 4390ab9..247ca6e 100644 --- a/tests/test_list.py +++ b/tests/test_list.py @@ -1,6 +1,6 @@ import pytest -from rds2py.lib_rds import PyRdsObject +from rds2py.PyRdsReader import PyRdsReader __author__ = "jkanche" __copyright__ = "jkanche" @@ -9,8 +9,7 @@ def test_read_atomic_lists(): parsed_obj = PyRdsObject("tests/data/lists.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + array = parsed_obj.read() assert array is not None assert len(array) > 0 @@ -18,8 +17,7 @@ def test_read_atomic_lists(): def test_read_atomic_lists_nested(): parsed_obj = PyRdsObject("tests/data/lists_nested.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + array = parsed_obj.read() assert array is not None assert len(array) > 0 @@ -27,8 +25,7 @@ def test_read_atomic_lists_nested(): def test_read_atomic_lists_nested_deep(): parsed_obj = PyRdsObject("tests/data/lists_nested_deep.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + array = parsed_obj.read() assert array is not None assert len(array) > 0 @@ -36,8 +33,7 @@ def test_read_atomic_lists_nested_deep(): def test_read_atomic_lists_df(): parsed_obj = PyRdsObject("tests/data/lists_df.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() + array = parsed_obj.read() assert array is not None assert len(array) > 0 @@ -45,8 +41,7 @@ def test_read_atomic_lists_df(): def test_read_atomic_lists_nested_deep_rownames(): parsed_obj = PyRdsObject("tests/data/lists_df_rownames.rds") - robject_obj = parsed_obj.get_robject() - array = robject_obj.realize_value() - + array = parsed_obj.read() + assert array is not None assert len(array) > 0