diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b25eb5..d5ef26d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.24) project(rds2cpp - VERSION 1.0.1 + VERSION 1.1.0 DESCRIPTION "Standalone C++ library for reading RDS files" LANGUAGES CXX) diff --git a/include/rds2cpp/parse_altrep.hpp b/include/rds2cpp/parse_altrep.hpp index a4bd04a..c6750bd 100644 --- a/include/rds2cpp/parse_altrep.hpp +++ b/include/rds2cpp/parse_altrep.hpp @@ -14,28 +14,28 @@ namespace rds2cpp { -template -IntegerVector parse_integer_body(Reader&, std::vector&); +template +IntegerVector parse_integer_body(Source_&); -template -DoubleVector parse_double_body(Reader& reader, std::vector&); +template +DoubleVector parse_double_body(Source_& src); -template -std::unique_ptr parse_object(Reader&, std::vector&, SharedParseInfo&); +template +std::unique_ptr parse_object(Source_&, SharedParseInfo&); -template -PairList parse_pairlist_body(Reader&, std::vector&, const Header&, SharedParseInfo&); +template +PairList parse_pairlist_body(Source_&, const Header&, SharedParseInfo&); namespace altrep_internal { -template -Vector parse_numeric_compact_seq(Reader& reader, std::vector& leftovers) try { - auto header = parse_header(reader, leftovers); +template +Vector parse_numeric_compact_seq(Source_& src) try { + auto header = parse_header(src); if (header[3] != static_cast(SEXPType::REAL)) { throw std::runtime_error("expected compact_seq to store sequence information in doubles"); } - auto info = parse_double_body(reader, leftovers); + auto info = parse_double_body(src); const auto& ranges = info.data; if (ranges.size() != 3) { throw std::runtime_error("expected compact_seq's sequence information to be of length 3"); @@ -49,7 +49,7 @@ Vector parse_numeric_compact_seq(Reader& reader, std::vector& lef output.data[i] = start; } - auto terminator = parse_header(reader, leftovers); + auto terminator = parse_header(src); if (terminator[3] != 254) { throw std::runtime_error("failed to terminate a compact_seq ALTREP correctly"); } @@ -59,36 +59,36 @@ Vector parse_numeric_compact_seq(Reader& reader, std::vector& lef throw traceback("failed to parse compact numeric ALTREP", e); } -template -Vector parse_attribute_wrapper(Reader& reader, std::vector& leftovers, SharedParseInfo& shared) try { - auto plist_header = parse_header(reader, leftovers); +template +Vector parse_attribute_wrapper(Source_& src, SharedParseInfo& shared) try { + auto plist_header = parse_header(src); if (plist_header[3] != static_cast(SEXPType::LIST)) { throw std::runtime_error("expected pairlist in wrap_* ALTREP's payload"); } // First pairlist element is a CONS cell where the first value is the wrapped integer vector. - auto contents = parse_object(reader, leftovers, shared); + auto contents = parse_object(src, shared); if (contents->type() != Vector::vector_sexp_type) { throw std::runtime_error("incorrectly typed contents in wrap_* ALTREP's payload"); } // Second cons value is the wrapping metadata, we don't care about it. - auto metaheader = parse_header(reader, leftovers); + auto metaheader = parse_header(src); if (metaheader[3] != static_cast(SEXPType::INT)) { throw std::runtime_error("wrap_* ALTREP should have an integer vector for its metadata"); } - auto metadata = parse_integer_body(reader, leftovers); + auto metadata = parse_integer_body(src); if (metadata.data.size() != 2) { throw std::runtime_error("wrap_* ALTREP's metadata should be a length-2 integer vector"); } // Now we can finally get the attributes, which makes up the rest of the pairlist. auto coerced = static_cast(contents.get()); - auto attrheader = parse_header(reader, leftovers); + auto attrheader = parse_header(src); if (attrheader[3] == static_cast(SEXPType::LIST)) { - parse_attributes_body(reader, leftovers, attrheader, coerced->attributes, shared); + parse_attributes_body(src, attrheader, coerced->attributes, shared); } else if (attrheader[3] != static_cast(SEXPType::NILVALUE_)) { throw std::runtime_error("wrap_* ALTREP's attributes should be a pairlist or NULL"); } @@ -98,15 +98,15 @@ Vector parse_attribute_wrapper(Reader& reader, std::vector& lefto throw traceback("failed to parse attribute-wrapped ALTREP", e); } -template -StringVector parse_deferred_string(Reader& reader, std::vector& leftovers, SharedParseInfo& shared) try { - auto plist_header = parse_header(reader, leftovers); +template +StringVector parse_deferred_string(Source_& src, SharedParseInfo& shared) try { + auto plist_header = parse_header(src); if (plist_header[3] != static_cast(SEXPType::LIST)) { throw std::runtime_error("expected pairlist in deferred_string ALTREP's payload"); } // First pairlist element is a CONS cell where the first value is the thing to be converted. - auto contents = parse_object(reader, leftovers, shared); + auto contents = parse_object(src, shared); StringVector output; if (contents->type() == SEXPType::INT){ @@ -160,18 +160,18 @@ StringVector parse_deferred_string(Reader& reader, std::vector& l } // Second cons value is the wrapping metadata, we don't care about it. - auto metaheader = parse_header(reader, leftovers); + auto metaheader = parse_header(src); if (metaheader[3] != static_cast(SEXPType::INT)) { throw std::runtime_error("deferred_string ALTREP should have an integer vector for its metadata"); } - auto metadata = parse_integer_body(reader, leftovers); + auto metadata = parse_integer_body(src); if (metadata.data.size() != 1) { throw std::runtime_error("deferred_string ALTREP's metadata should be a length-1 integer vector"); } // Chomp up the null. - auto terminator = parse_header(reader, leftovers); + auto terminator = parse_header(src); if (terminator[3] != static_cast(SEXPType::NILVALUE_)) { throw std::runtime_error("failed to terminate a deferred string ALTREP correctly"); } @@ -183,14 +183,14 @@ StringVector parse_deferred_string(Reader& reader, std::vector& l } -template -std::unique_ptr parse_altrep_body(Reader& reader, std::vector& leftovers, SharedParseInfo& shared) try { - auto header = parse_header(reader, leftovers); +template +std::unique_ptr parse_altrep_body(Source_& src, SharedParseInfo& shared) try { + auto header = parse_header(src); if (header[3] != static_cast(SEXPType::LIST)) { throw std::runtime_error("expected ALTREP description to be a pairlist"); } - auto plist = parse_pairlist_body(reader, leftovers, header, shared); + auto plist = parse_pairlist_body(src, header, shared); if (plist.data.size() < 1 || plist.data[0]->type() != SEXPType::SYM) { throw std::runtime_error("expected type specification symbol in the ALTREP description"); } @@ -204,11 +204,11 @@ std::unique_ptr parse_altrep_body(Reader& reader, std::vectorindex]; if (symb.name == "wrap_integer") { - pointerize_(altrep_internal::parse_attribute_wrapper(reader, leftovers, shared)); + pointerize_(altrep_internal::parse_attribute_wrapper(src, shared)); } else if (symb.name == "compact_intseq") { - pointerize_(altrep_internal::parse_numeric_compact_seq(reader, leftovers)); + pointerize_(altrep_internal::parse_numeric_compact_seq(src)); } else if (symb.name == "deferred_string") { - pointerize_(altrep_internal::parse_deferred_string(reader, leftovers, shared)); + pointerize_(altrep_internal::parse_deferred_string(src, shared)); } else { throw std::runtime_error("unrecognized ALTREP type '" + symb.name + "'"); } diff --git a/include/rds2cpp/parse_atomic.hpp b/include/rds2cpp/parse_atomic.hpp index de322fe..d7b5485 100644 --- a/include/rds2cpp/parse_atomic.hpp +++ b/include/rds2cpp/parse_atomic.hpp @@ -13,18 +13,22 @@ namespace rds2cpp { namespace atomic_internal { -template -Vector parse_integer_or_logical_body(Reader& reader, std::vector& leftovers) { - size_t len = get_length(reader, leftovers); +template +Vector parse_integer_or_logical_body(Source_& src) { + size_t len = get_length(src); Vector output(len); constexpr size_t width = 4; + static_assert(width == sizeof(decltype(output.data[0]))); + size_t byte_length = width * len; + auto ptr = reinterpret_cast(output.data.data()); - extract_up_to(reader, leftovers, width * len, - [&](const unsigned char* buffer, size_t n, size_t i) -> void { - std::copy(buffer, buffer + n, ptr + i); + for (size_t i = 0; i < byte_length; ++i) { + if (!src.advance()) { + throw empty_error(); } - ); + ptr[i] = src.get(); + } // Flipping endianness. if (little_endian()) { @@ -39,32 +43,36 @@ Vector parse_integer_or_logical_body(Reader& reader, std::vector& } -template -IntegerVector parse_integer_body(Reader& reader, std::vector& leftovers) try { - return atomic_internal::parse_integer_or_logical_body(reader, leftovers); +template +IntegerVector parse_integer_body(Source_& src) try { + return atomic_internal::parse_integer_or_logical_body(src); } catch (std::exception& e) { throw traceback("failed to parse data for an integer vector", e); } -template -LogicalVector parse_logical_body(Reader& reader, std::vector& leftovers) try { - return atomic_internal::parse_integer_or_logical_body(reader, leftovers); +template +LogicalVector parse_logical_body(Source_& src) try { + return atomic_internal::parse_integer_or_logical_body(src); } catch (std::exception& e) { throw traceback("failed to parse data for a logical vector", e); } -template -DoubleVector parse_double_body(Reader& reader, std::vector& leftovers) try { - size_t len = get_length(reader, leftovers); +template +DoubleVector parse_double_body(Source_& src) try { + size_t len = get_length(src); DoubleVector output(len); constexpr size_t width = 8; + static_assert(width == sizeof(decltype(output.data[0]))); + size_t byte_length = width * len; + auto ptr = reinterpret_cast(output.data.data()); - extract_up_to(reader, leftovers, width * len, - [&](const unsigned char* buffer, size_t n, size_t i) -> void { - std::copy(buffer, buffer + n, ptr + i); + for (size_t i = 0; i < byte_length; ++i) { + if (!src.advance()) { + throw empty_error(); } - ); + ptr[i] = src.get(); + } // Flipping endianness. if (little_endian()) { @@ -79,41 +87,48 @@ DoubleVector parse_double_body(Reader& reader, std::vector& lefto throw traceback("failed to parse data for a double vector", e); } -template -RawVector parse_raw_body(Reader& reader, std::vector& leftovers) try { - size_t len = get_length(reader, leftovers); +template +RawVector parse_raw_body(Source_& src) try { + size_t len = get_length(src); RawVector output(len); auto ptr = reinterpret_cast(output.data.data()); - extract_up_to(reader, leftovers, len, - [&](const unsigned char* buffer, size_t n, size_t i) -> void { - std::copy(buffer, buffer + n, ptr + i); + for (size_t i = 0; i < len; ++i) { + if (!src.advance()) { + throw empty_error(); } - ); + ptr[i] = src.get(); + } return output; } catch (std::exception& e) { throw traceback("failed to parse data for a raw vector", e); } -template -ComplexVector parse_complex_body(Reader& reader, std::vector& leftovers) try { - size_t len = get_length(reader, leftovers); +template +ComplexVector parse_complex_body(Source_& src) try { + size_t len = get_length(src); ComplexVector output(len); constexpr size_t width = 16; + static_assert(width == sizeof(decltype(output.data[0]))); + size_t byte_length = width * len; + auto ptr = reinterpret_cast(output.data.data()); - extract_up_to(reader, leftovers, width * len, - [&](const unsigned char* buffer, size_t n, size_t i) -> void { - std::copy(buffer, buffer + n, ptr + i); + for (size_t b = 0; b < byte_length; ++b) { + if (!src.advance()) { + throw empty_error(); } - ); + ptr[b] = src.get(); + } // Flipping endianness for each double. if (little_endian()) { + constexpr size_t single_width = width / 2; + size_t single_length = len * 2; auto copy = ptr; - for (size_t n = 0; n < len * 2; ++n, copy += width / 2) { - std::reverse(copy, copy + width/2); + for (size_t n = 0; n < single_length; ++n, copy += single_width) { + std::reverse(copy, copy + single_width); } } @@ -122,12 +137,12 @@ ComplexVector parse_complex_body(Reader& reader, std::vector& lef throw traceback("failed to parse data for a complex vector", e); } -template -StringVector parse_string_body(Reader& reader, std::vector& leftovers) try { - size_t len = get_length(reader, leftovers); +template +StringVector parse_string_body(Source_& src) try { + size_t len = get_length(src); StringVector output(len); for (size_t i = 0; i < len; ++i) { - auto str = parse_single_string(reader, leftovers); + auto str = parse_single_string(src); output.data[i] = str.value; output.encodings[i] = str.encoding; output.missing[i] = str.missing; diff --git a/include/rds2cpp/parse_attributes.hpp b/include/rds2cpp/parse_attributes.hpp index 30265ee..648e5ba 100644 --- a/include/rds2cpp/parse_attributes.hpp +++ b/include/rds2cpp/parse_attributes.hpp @@ -10,16 +10,16 @@ namespace rds2cpp { -template -PairList parse_pairlist_body(Reader&, std::vector&, const Header&, SharedParseInfo&); +template +PairList parse_pairlist_body(Source_&, const Header&, SharedParseInfo&); inline bool has_attributes(const Header& header) { return (header[2] & 0x2); } -template -void parse_attributes_body(Reader& reader, std::vector& leftovers, const Header& header, Attributes& output, SharedParseInfo& shared) try { - auto plist = parse_pairlist_body(reader, leftovers, header, shared); +template +void parse_attributes_body(Source_& src, const Header& header, Attributes& output, SharedParseInfo& shared) try { + auto plist = parse_pairlist_body(src, header, shared); size_t nnodes = plist.data.size(); for (size_t t = 0; t < nnodes; ++t) { @@ -35,13 +35,13 @@ void parse_attributes_body(Reader& reader, std::vector& leftovers throw traceback("failed to parse attribute contents", e); } -template -void parse_attributes(Reader& reader, std::vector& leftovers, Attributes& output, SharedParseInfo& shared) try { - auto header = parse_header(reader, leftovers); +template +void parse_attributes(Source_& src, Attributes& output, SharedParseInfo& shared) try { + auto header = parse_header(src); if (header[3] != static_cast(SEXPType::LIST)) { throw std::runtime_error("attributes should be a pairlist"); } - parse_attributes_body(reader, leftovers, header, output, shared); + parse_attributes_body(src, header, output, shared); return; } catch (std::exception& e) { throw traceback("failed to parse attributes", e); diff --git a/include/rds2cpp/parse_builtin.hpp b/include/rds2cpp/parse_builtin.hpp index 11f0306..1f9e372 100644 --- a/include/rds2cpp/parse_builtin.hpp +++ b/include/rds2cpp/parse_builtin.hpp @@ -9,18 +9,19 @@ namespace rds2cpp { -template -BuiltInFunction parse_builtin_body(Reader& reader, std::vector& leftovers) try { - size_t len = get_length(reader, leftovers); +template +BuiltInFunction parse_builtin_body(Source_& src) try { + size_t len = get_length(src); BuiltInFunction output; - extract_up_to(reader, leftovers, len, - [&](const unsigned char* buffer, size_t n, size_t) -> void { - auto ptr = reinterpret_cast(buffer); - output.name.insert(output.name.end(), ptr, ptr + n); + output.name.resize(len); + for (size_t i = 0; i < len; ++i) { + if (!src.advance()) { + throw empty_error(); } - ); - + output.name[i] = src.get(); + } + return output; } catch(std::exception& e) { throw traceback("failed to parse built-in function body", e); diff --git a/include/rds2cpp/parse_environment.hpp b/include/rds2cpp/parse_environment.hpp index ef2f61c..fd59654 100644 --- a/include/rds2cpp/parse_environment.hpp +++ b/include/rds2cpp/parse_environment.hpp @@ -11,8 +11,8 @@ namespace rds2cpp { -template -PairList parse_pairlist_body(Reader&, std::vector&, SharedParseInfo&); +template +PairList parse_pairlist_body(Source_&, SharedParseInfo&); inline EnvironmentIndex parse_global_environment_body() { return EnvironmentIndex(SEXPType::GLOBALENV_); @@ -26,27 +26,31 @@ inline EnvironmentIndex parse_empty_environment_body() { return EnvironmentIndex(SEXPType::EMPTYENV_); } -template -EnvironmentIndex parse_new_environment_body(Reader& reader, std::vector& leftovers, SharedParseInfo& shared) try { +template +EnvironmentIndex parse_new_environment_body(Source_& src, SharedParseInfo& shared) try { // Need to provision the environment first, so that internal references are valid. size_t eindex = shared.request_environment(); Environment new_env; // Is it locked or not? uint32_t locked = 0; - extract_up_to(reader, leftovers, 4, [&](const unsigned char* buffer, size_t n, size_t) -> void { - for (size_t j = 0; j < n; ++j) { - locked <<= 8; - locked += buffer[j]; - } - }); + for (int i = 0; i < 4; ++i) { + if (!src.advance()) { + throw empty_error(); + } + locked <<= 8; + locked += src.get(); + } new_env.locked = (locked > 0); // The next 4 bytes describe the parent environment. std::array parent; - extract_up_to(reader, leftovers, 4, [&](const unsigned char* buffer, size_t n, size_t i) -> void { - std::copy(buffer, buffer + n, parent.data() + i); - }); + for (int i = 0; i < 4; ++i) { + if (!src.advance()) { + throw empty_error(); + } + parent[i] = src.get(); + } auto lastbit = parent[3]; if (lastbit == static_cast(SEXPType::REF)) { @@ -54,7 +58,7 @@ EnvironmentIndex parse_new_environment_body(Reader& reader, std::vector(SEXPType::ENV)) { - auto env = parse_new_environment_body(reader, leftovers, shared); + auto env = parse_new_environment_body(src, shared); new_env.parent = env.index; new_env.parent_type = SEXPType::ENV; @@ -74,9 +78,9 @@ EnvironmentIndex parse_new_environment_body(Reader& reader, std::vector(SEXPType::LIST)) { - auto plist = parse_pairlist_body(reader, leftovers, unhashed, shared); + auto plist = parse_pairlist_body(src, unhashed, shared); for (size_t i = 0; i < plist.data.size(); ++i) { if (!plist.has_tag[i]) { @@ -87,20 +91,20 @@ EnvironmentIndex parse_new_environment_body(Reader& reader, std::vector(SEXPType::NILVALUE_)) { throw std::runtime_error("unhashed environment should not contain a non-NULL hash table"); } } else if (unhashed[3] == static_cast(SEXPType::NILVALUE_)) { // The next part is the hash table. - auto hash_header = parse_header(reader, leftovers); + auto hash_header = parse_header(src); if (hash_header[3] != static_cast(SEXPType::VEC)) { throw std::runtime_error("environment's hash table should be a list"); } new_env.hashed = true; - auto vec = parse_list_body(reader, leftovers, shared); + auto vec = parse_list_body(src, shared); for (size_t i = 0; i < vec.data.size(); ++i) { if (vec.data[i]->type() == SEXPType::NIL) { continue; @@ -127,9 +131,9 @@ EnvironmentIndex parse_new_environment_body(Reader& reader, std::vector(SEXPType::LIST)) { - parse_attributes_body(reader, leftovers, attr_header, new_env.attributes, shared); + parse_attributes_body(src, attr_header, new_env.attributes, shared); } else if (attr_header[3] != static_cast(SEXPType::NILVALUE_)) { throw std::runtime_error("environment should be terminated by a null"); } diff --git a/include/rds2cpp/parse_expression.hpp b/include/rds2cpp/parse_expression.hpp index c3781da..1f182b2 100644 --- a/include/rds2cpp/parse_expression.hpp +++ b/include/rds2cpp/parse_expression.hpp @@ -10,15 +10,15 @@ namespace rds2cpp { -template -std::unique_ptr parse_object(Reader&, std::vector&, SharedParseInfo& shared); +template +std::unique_ptr parse_object(Source_&, SharedParseInfo& shared); -template -ExpressionVector parse_expression_body(Reader& reader, std::vector& leftovers, SharedParseInfo& shared) try { - size_t len = get_length(reader, leftovers); +template +ExpressionVector parse_expression_body(Source_& src, SharedParseInfo& shared) try { + size_t len = get_length(src); ExpressionVector output(len); for (size_t i = 0; i < len; ++i) { - output.data[i] = parse_object(reader, leftovers, shared); + output.data[i] = parse_object(src, shared); } return output; } catch (std::exception& e) { diff --git a/include/rds2cpp/parse_external_pointer.hpp b/include/rds2cpp/parse_external_pointer.hpp index 4b4ec49..e666ec0 100644 --- a/include/rds2cpp/parse_external_pointer.hpp +++ b/include/rds2cpp/parse_external_pointer.hpp @@ -12,18 +12,18 @@ namespace rds2cpp { -template -std::unique_ptr parse_object(Reader& reader, std::vector& leftovers, SharedParseInfo& shared); +template +std::unique_ptr parse_object(Source_& src, SharedParseInfo& shared); -template -ExternalPointerIndex parse_external_pointer_body(Reader& reader, std::vector& leftovers, const Header& header, SharedParseInfo& shared) try { +template +ExternalPointerIndex parse_external_pointer_body(Source_& src, const Header& header, SharedParseInfo& shared) try { auto idx = shared.request_external_pointer(); auto& extptr = shared.external_pointers[idx]; - extptr.protection = parse_object(reader, leftovers, shared); - extptr.tag = parse_object(reader, leftovers, shared); + extptr.protection = parse_object(src, shared); + extptr.tag = parse_object(src, shared); if (has_attributes(header)) { - parse_attributes(reader, leftovers, extptr.attributes, shared); + parse_attributes(src, extptr.attributes, shared); } return ExternalPointerIndex(idx); diff --git a/include/rds2cpp/parse_language.hpp b/include/rds2cpp/parse_language.hpp index 2144064..b440ba9 100644 --- a/include/rds2cpp/parse_language.hpp +++ b/include/rds2cpp/parse_language.hpp @@ -12,14 +12,14 @@ namespace rds2cpp { -template -PairList parse_pairlist_body(Reader&, std::vector&, const Header&, SharedParseInfo&); +template +PairList parse_pairlist_body(Source_&, const Header&, SharedParseInfo&); -template -LanguageObject parse_language_body(Reader& reader, std::vector& leftovers, const Header& header, SharedParseInfo& shared) try { +template +LanguageObject parse_language_body(Source_& src, const Header& header, SharedParseInfo& shared) try { LanguageObject output; - auto contents = parse_pairlist_body(reader, leftovers, header, shared); + auto contents = parse_pairlist_body(src, header, shared); output.attributes = std::move(contents.attributes); if (contents.has_tag.size() < 1) { diff --git a/include/rds2cpp/parse_list.hpp b/include/rds2cpp/parse_list.hpp index 1938f5c..84c1b11 100644 --- a/include/rds2cpp/parse_list.hpp +++ b/include/rds2cpp/parse_list.hpp @@ -10,16 +10,16 @@ namespace rds2cpp { -template -std::unique_ptr parse_object(Reader&, std::vector&, SharedParseInfo& shared); +template +std::unique_ptr parse_object(Source_&, SharedParseInfo& shared); -template -GenericVector parse_list_body(Reader& reader, std::vector& leftovers, SharedParseInfo& shared) try { - size_t len = get_length(reader, leftovers); +template +GenericVector parse_list_body(Source_& src, SharedParseInfo& shared) try { + size_t len = get_length(src); GenericVector output(len); for (size_t i = 0; i < len; ++i) { try { - output.data[i] = parse_object(reader, leftovers, shared); + output.data[i] = parse_object(src, shared); } catch (std::exception& e) { throw traceback("failed to parse list element " + std::to_string(i + 1), e); } diff --git a/include/rds2cpp/parse_object.hpp b/include/rds2cpp/parse_object.hpp index d6f8fef..2eac70f 100644 --- a/include/rds2cpp/parse_object.hpp +++ b/include/rds2cpp/parse_object.hpp @@ -26,9 +26,9 @@ namespace rds2cpp { -template -std::unique_ptr parse_object(Reader& reader, std::vector& leftovers, SharedParseInfo& shared) { - auto details = parse_header(reader, leftovers); +template +std::unique_ptr parse_object(Source_& src, SharedParseInfo& shared) { + auto details = parse_header(src); auto sexp_type = details[3]; std::unique_ptr output; @@ -45,25 +45,25 @@ std::unique_ptr parse_object(Reader& reader, std::vector }; if (sexp_type == static_cast(SEXPType::LIST)) { - pointerize_(parse_pairlist_body(reader, leftovers, details, shared)); + pointerize_(parse_pairlist_body(src, details, shared)); } else if (sexp_type == static_cast(SEXPType::SYM)) { - pointerize_(parse_symbol_body(reader, leftovers, shared)); + pointerize_(parse_symbol_body(src, shared)); } else if (sexp_type == static_cast(SEXPType::S4)) { - pointerize_(parse_s4_body(reader, leftovers, details, shared)); + pointerize_(parse_s4_body(src, details, shared)); } else if (sexp_type == static_cast(SEXPType::ALTREP_)) { - output = parse_altrep_body(reader, leftovers, shared); + output = parse_altrep_body(src, shared); } else if (sexp_type == static_cast(SEXPType::NIL) || sexp_type == static_cast(SEXPType::NILVALUE_)) { pointerize_(Null()); } else if (sexp_type == static_cast(SEXPType::ENV)) { - pointerize_(parse_new_environment_body(reader, leftovers, shared)); + pointerize_(parse_new_environment_body(src, shared)); } else if (sexp_type == static_cast(SEXPType::EXTPTR)) { - pointerize_(parse_external_pointer_body(reader, leftovers, details, shared)); + pointerize_(parse_external_pointer_body(src, details, shared)); } else if (sexp_type == static_cast(SEXPType::GLOBALENV_)) { pointerize_(parse_global_environment_body()); @@ -78,36 +78,36 @@ std::unique_ptr parse_object(Reader& reader, std::vector output = shared.resolve_reference(details); } else if (sexp_type == static_cast(SEXPType::BUILTIN)) { - pointerize_(parse_builtin_body(reader, leftovers)); + pointerize_(parse_builtin_body(src)); } else if (sexp_type == static_cast(SEXPType::LANG)) { - pointerize_(parse_language_body(reader, leftovers, details, shared)); + pointerize_(parse_language_body(src, details, shared)); } else { Attributes* attr = nullptr; if (sexp_type == static_cast(SEXPType::INT)) { - attr = pointerize_attr(parse_integer_body(reader, leftovers)); + attr = pointerize_attr(parse_integer_body(src)); } else if (sexp_type == static_cast(SEXPType::LGL)) { - attr = pointerize_attr(parse_logical_body(reader, leftovers)); + attr = pointerize_attr(parse_logical_body(src)); } else if (sexp_type == static_cast(SEXPType::RAW)) { - attr = pointerize_attr(parse_raw_body(reader, leftovers)); + attr = pointerize_attr(parse_raw_body(src)); } else if (sexp_type == static_cast(SEXPType::REAL)) { - attr = pointerize_attr(parse_double_body(reader, leftovers)); + attr = pointerize_attr(parse_double_body(src)); } else if (sexp_type == static_cast(SEXPType::CPLX)) { - attr = pointerize_attr(parse_complex_body(reader, leftovers)); + attr = pointerize_attr(parse_complex_body(src)); } else if (sexp_type == static_cast(SEXPType::STR)) { - attr = pointerize_attr(parse_string_body(reader, leftovers)); + attr = pointerize_attr(parse_string_body(src)); } else if (sexp_type == static_cast(SEXPType::VEC)) { - attr = pointerize_attr(parse_list_body(reader, leftovers, shared)); + attr = pointerize_attr(parse_list_body(src, shared)); } else if (sexp_type == static_cast(SEXPType::EXPR)) { - attr = pointerize_attr(parse_expression_body(reader, leftovers, shared)); + attr = pointerize_attr(parse_expression_body(src, shared)); } else { throw std::runtime_error("cannot read unknown (or unsupported) SEXP type " + std::to_string(static_cast(sexp_type))); } if (has_attributes(details) && attr) { - parse_attributes(reader, leftovers, *attr, shared); + parse_attributes(src, *attr, shared); } } diff --git a/include/rds2cpp/parse_pairlist.hpp b/include/rds2cpp/parse_pairlist.hpp index 596f1a6..e45b608 100644 --- a/include/rds2cpp/parse_pairlist.hpp +++ b/include/rds2cpp/parse_pairlist.hpp @@ -13,27 +13,27 @@ namespace rds2cpp { -template -std::unique_ptr parse_object(Reader&, std::vector&, SharedParseInfo&); +template +std::unique_ptr parse_object(Source_&, SharedParseInfo&); namespace pairlist_internal { -template -void recursive_parse(Reader& reader, std::vector& leftovers, PairList& output, const Header& header, SharedParseInfo& shared) { +template +void recursive_parse(Source_& src, PairList& output, const Header& header, SharedParseInfo& shared) { bool has_attr = header[2] & 0x2; bool has_tag = header[2] & 0x4; if (has_attr) { - parse_attributes(reader, leftovers, output.attributes, shared); + parse_attributes(src, output.attributes, shared); } output.has_tag.push_back(has_tag); if (has_tag) { - auto header = parse_header(reader, leftovers); + auto header = parse_header(src); size_t sindex; if (header[3] == static_cast(SEXPType::SYM)) { - auto sdx = parse_symbol_body(reader, leftovers, shared); + auto sdx = parse_symbol_body(src, shared); sindex = sdx.index; } else if (header[3] == static_cast(SEXPType::REF)) { sindex = shared.get_symbol_index(header); @@ -51,7 +51,7 @@ void recursive_parse(Reader& reader, std::vector& leftovers, Pair } try { - output.data.push_back(parse_object(reader, leftovers, shared)); + output.data.push_back(parse_object(src, shared)); } catch (std::exception& e) { if (output.tag_names.back().empty()) { throw traceback("failed to parse unnamed pairlist element " + std::to_string(output.tag_names.size()), e); @@ -60,23 +60,23 @@ void recursive_parse(Reader& reader, std::vector& leftovers, Pair } } - auto next_header = parse_header(reader, leftovers); + auto next_header = parse_header(src); if (next_header[3] == static_cast(SEXPType::NILVALUE_)) { return; } else if (next_header[3] != static_cast(SEXPType::LIST)) { throw std::runtime_error("expected a terminator or the next pairlist node"); } - recursive_parse(reader, leftovers, output, next_header, shared); + recursive_parse(src, output, next_header, shared); return; } } -template -PairList parse_pairlist_body(Reader& reader, std::vector& leftovers, const Header& header, SharedParseInfo& shared) try { +template +PairList parse_pairlist_body(Source_& src, const Header& header, SharedParseInfo& shared) try { PairList output; - pairlist_internal::recursive_parse(reader, leftovers, output, header, shared); + pairlist_internal::recursive_parse(src, output, header, shared); return output; } catch (std::exception& e) { throw traceback("failed to parse a pairlist body", e); diff --git a/include/rds2cpp/parse_rds.hpp b/include/rds2cpp/parse_rds.hpp index bd8b761..cae9b53 100644 --- a/include/rds2cpp/parse_rds.hpp +++ b/include/rds2cpp/parse_rds.hpp @@ -10,7 +10,7 @@ #include "SharedParseInfo.hpp" #include "parse_object.hpp" -#include "byteme/SomeFileReader.hpp" +#include "byteme/byteme.hpp" /** * @file parse_rds.hpp @@ -23,49 +23,86 @@ namespace rds2cpp { /** * Parse the contents of an RDS file. * - * @tparam Reader A [`byteme::Reader`](https://ltla.github.io/byteme) class. + * @tparam parallel_ Whether to read and parse the file in parallel. + * @tparam Reader_ A [`byteme::Reader`](https://ltla.github.io/byteme) class, or any class with a compatible interface. * * @param reader Instance of a `Reader` class, containing the contents of the RDS file. * * @return An `RdsFile` object containing the contents of the RDS file. */ -template -RdsFile parse_rds(Reader& reader) { - RdsFile output(false); +template +RdsFile parse_rds(Reader_& reader) { + typename std::conditional, + byteme::PerByteParallel + >::type src(&reader); - std::vector leftovers; + RdsFile output(false); - // Reading the header first. + // Reading the header first. This is the first and only time that + // we need to do a src.valid() check, as we're using the current + // position of the source; in all other cases, it can be assumed + // that the source needs to be advance()'d before get(). { - std::vector accumulated; try { - extract_up_to(reader, leftovers, 14, - [&](const unsigned char* buffer, size_t n, size_t) -> void { - accumulated.insert(accumulated.end(), buffer, buffer + n); - } - ); + if (!src.valid()) { + throw empty_error(); + } + if (src.get() != 'X') { + throw std::runtime_error("only RDS files in XDR format are currently supported"); + } + + if (!src.advance()) { + throw empty_error(); + } + if (src.get() != '\n') { + throw std::runtime_error("only RDS files in XDR format are currently supported"); + } } catch (std::exception& e) { throw traceback("failed to read the header from the RDS preamble", e); } - if (static_cast(accumulated[0]) != 'X' && static_cast(accumulated[1]) != '\n') { - throw std::runtime_error("only RDS files in XDR format are currently supported"); - } - output.format_version = 0; - for (size_t pos = 2; pos < 6; ++pos) { - output.format_version <<= 8; - output.format_version += accumulated[pos]; - } + try { + for (int i = 0; i < 4; ++i) { + if (!src.advance()) { + throw empty_error(); + } + output.format_version <<= 8; + output.format_version += src.get(); + } + } catch (std::exception& e) { + throw traceback("failed to read the format version number from the RDS preamble", e); + } - // Just skipping the first byte for the versions... unless we get up - // to a major version > 255, then we're in trouble. - for (size_t pos = 7; pos < 10; ++pos) { - output.writer_version[pos - 7] = accumulated[pos]; + // Just skipping the first byte for the R reader/writer versions... + // unless we get up to a major version > 255, then we're in trouble. + try { + if (!src.advance()) { + throw empty_error(); + } + for (int pos = 0; pos < 3; ++pos) { + if (!src.advance()) { + throw empty_error(); + } + output.writer_version[pos] = src.get(); + } + } catch (std::exception& e) { + throw traceback("failed to read the writer version number from the RDS preamble", e); } - for (size_t pos = 11; pos < 14; ++pos) { - output.reader_version[pos - 11] = accumulated[pos]; + try { + if (!src.advance()) { + throw empty_error(); + } + for (int pos = 0; pos < 3; ++pos) { + if (!src.advance()) { + throw empty_error(); + } + output.reader_version[pos] = src.get(); + } + } catch (std::exception& e) { + throw traceback("failed to read the reader version number from the RDS preamble", e); } } @@ -73,24 +110,25 @@ RdsFile parse_rds(Reader& reader) { { size_t encoding_length = 0; try { - extract_up_to(reader, leftovers, 4, - [&](const unsigned char* buffer, size_t n, size_t) -> void { - for (size_t b = 0; b < n; ++b) { - encoding_length <<= 8; - encoding_length += buffer[b]; - } + for (int b = 0; b < 4; ++b) { + if (!src.advance()) { + throw empty_error(); } - ); + encoding_length <<= 8; + encoding_length += src.get(); + } } catch (std::exception& e) { throw traceback("failed to read the encoding length from the RDS preamble", e); } try { - extract_up_to(reader, leftovers, encoding_length, - [&](const unsigned char* buffer, size_t n, size_t) -> void { - output.encoding.insert(output.encoding.end(), buffer, buffer + n); + output.encoding.reserve(encoding_length); + for (size_t b = 0; b < encoding_length; ++b) { + if (!src.advance()) { + throw empty_error(); } - ); + output.encoding.push_back(src.get()); + } } catch (std::exception& e) { throw traceback("failed to read the encoding string from the RDS preamble", e); } @@ -98,7 +136,7 @@ RdsFile parse_rds(Reader& reader) { // Now we can finally read the damn object. SharedParseInfo shared; - output.object = parse_object(reader, leftovers, shared); + output.object = parse_object(src, shared); output.environments = std::move(shared.environments); output.symbols = std::move(shared.symbols); output.external_pointers = std::move(shared.external_pointers); @@ -109,13 +147,15 @@ RdsFile parse_rds(Reader& reader) { /** * Parse the contents of an RDS file. * + * @tparam parallel_ Whether to read and parse the file in parallel. * @param file Path to an RDS file. * * @return An `RdsFile` object containing the contents of `file`. */ -inline RdsFile parse_rds(std::string file) { +template +RdsFile parse_rds(std::string file) { byteme::SomeFileReader reader(file.c_str()); - return parse_rds(reader); + return parse_rds(reader); } /** diff --git a/include/rds2cpp/parse_s4.hpp b/include/rds2cpp/parse_s4.hpp index a3189e4..0537b2f 100644 --- a/include/rds2cpp/parse_s4.hpp +++ b/include/rds2cpp/parse_s4.hpp @@ -10,26 +10,26 @@ namespace rds2cpp { -template -std::unique_ptr parse_object(Reader&, std::vector&, SharedParseInfo&); +template +std::unique_ptr parse_object(Source_&, SharedParseInfo&); -template -PairList parse_pairlist_body(Reader&, std::vector&, const Header&, SharedParseInfo&); +template +PairList parse_pairlist_body(Source_&, const Header&, SharedParseInfo&); -template -S4Object parse_s4_body(Reader& reader, std::vector& leftovers, const Header& header, SharedParseInfo& shared) try { +template +S4Object parse_s4_body(Source_& src, const Header& header, SharedParseInfo& shared) try { if (!(header[2] & 0x2) || !(header[2] & 0x1) || !(header[1] & 0x1)) { throw std::runtime_error("S4 objects should have object, attribute, and gp-S4 bits set in header"); } S4Object output; - auto slot_header = parse_header(reader, leftovers); + auto slot_header = parse_header(src); if (slot_header[3] != static_cast(SEXPType::LIST)) { throw std::runtime_error("slots of an S4 object should be stored as a pairlist"); } - auto slot_plist = parse_pairlist_body(reader, leftovers, slot_header, shared); + auto slot_plist = parse_pairlist_body(src, slot_header, shared); size_t nslots = slot_plist.data.size(); bool found_class = false; diff --git a/include/rds2cpp/parse_single_string.hpp b/include/rds2cpp/parse_single_string.hpp index bee4b50..63866c1 100644 --- a/include/rds2cpp/parse_single_string.hpp +++ b/include/rds2cpp/parse_single_string.hpp @@ -14,37 +14,37 @@ struct StringInfo { bool missing; }; -template -StringInfo parse_single_string(Reader& reader, std::vector& leftovers) try { - auto header = parse_header(reader, leftovers); +template +StringInfo parse_single_string(Source_& src) try { + auto header = parse_header(src); std::reverse(header.begin(), header.end()); // make it little-endian for easier indexing. if (header[0] != static_cast(SEXPType::CHAR)) { throw std::runtime_error("expected a CHARSXP representation for a string"); } // Getting the string length; all strings are less than 2^31-1, - // see https://cran.r-project.org/doc/manuals/r-release/R-ints.html#Long-vectors. - uint32_t strlen = 0; - extract_up_to(reader, leftovers, 4, - [&](const unsigned char* buffer, size_t n, size_t) -> void { - for (size_t x = 0; x < n; ++x) { - strlen <<= 8; - strlen += buffer[x]; - } + // see https://cran.r-project.org/doc/manuals/r-release/R-ints.html#Long-vectors + size_t strlen = 0; + for (int i = 0; i < 4; ++i) { + if (!src.advance()) { + throw empty_error(); } - ); + strlen <<= 8; + strlen += src.get(); + } StringInfo output; output.missing = (strlen == static_cast(-1)); if (!output.missing) { auto& str = output.value; - extract_up_to(reader, leftovers, strlen, - [&](const unsigned char* buffer, size_t n, size_t) -> void { - auto ptr = reinterpret_cast(buffer); - str.insert(str.end(), ptr, ptr + n); + str.resize(strlen); + for (size_t i = 0; i < strlen; ++i) { + if (!src.advance()) { + throw empty_error(); } - ); + str[i] = src.get(); + } /* String encoding is stored in the gp field, from bits 12 to 27 in the header. * We make life easier by just accessing the relevant byte below, after adjusting diff --git a/include/rds2cpp/parse_symbol.hpp b/include/rds2cpp/parse_symbol.hpp index 2cf4a72..a7ccdf1 100644 --- a/include/rds2cpp/parse_symbol.hpp +++ b/include/rds2cpp/parse_symbol.hpp @@ -13,8 +13,8 @@ namespace rds2cpp { template -SymbolIndex parse_symbol_body(Reader& reader, std::vector& leftovers, SharedParseInfo& shared) try { - auto str = parse_single_string(reader, leftovers); +SymbolIndex parse_symbol_body(Reader& reader, SharedParseInfo& shared) try { + auto str = parse_single_string(reader); if (str.missing) { throw new std::runtime_error("expected a non-missing string for a symbol"); } diff --git a/include/rds2cpp/utils_parse.hpp b/include/rds2cpp/utils_parse.hpp index ca726c0..a781701 100644 --- a/include/rds2cpp/utils_parse.hpp +++ b/include/rds2cpp/utils_parse.hpp @@ -8,49 +8,25 @@ namespace rds2cpp { -template -void extract_up_to(Reader& reader, std::vector& leftovers, size_t expected, Function fun) { - size_t processed = std::min(expected, leftovers.size()); - fun(leftovers.data(), processed, 0); - std::copy(leftovers.begin() + processed, leftovers.end(), leftovers.begin()); - leftovers.resize(leftovers.size() - processed); - - while (processed < expected) { - if (!reader.load()) { - throw std::runtime_error("no more bytes to read"); - } - - const unsigned char * buffer = reader.buffer(); - size_t available = reader.available(); - - size_t required = expected - processed; - size_t usable = std::min(required, available); - fun(buffer, usable, processed); - processed += usable; - - if (processed == expected) { - leftovers.insert(leftovers.end(), buffer + usable, buffer + available); - break; - } - } -} - inline std::runtime_error traceback(std::string base, const std::exception& e) { return std::runtime_error(base + "\n - " + e.what()); } -template -size_t get_length(Reader& reader, std::vector& leftovers) { +inline std::runtime_error empty_error() { + return std::runtime_error("no more bytes to read"); +} + +template +size_t get_length(Source_& src) { uint32_t initial = 0; try { - extract_up_to(reader, leftovers, 4, - [&](const unsigned char* buffer, size_t n, size_t) -> void { - for (size_t b = 0; b < n; ++b) { - initial <<= 8; - initial += buffer[b]; - } + for (int b = 0; b < 4; ++b) { + if (!src.advance()) { + throw empty_error(); } - ); + initial <<= 8; + initial += src.get(); + } } catch (std::exception& e) { throw traceback("failed to extract vector length", e); } @@ -62,14 +38,13 @@ size_t get_length(Reader& reader, std::vector& leftovers) { // Hack to deal with large lengths. uint64_t full = 0; try { - extract_up_to(reader, leftovers, 8, - [&](const unsigned char* buffer, size_t n, size_t) -> void { - for (size_t b = 0; b < n; ++b) { - full <<= 8; - full += buffer[b]; - } + for (size_t b = 0; b < 8; ++b) { + if (!src.advance()) { + throw empty_error(); } - ); + full <<= 8; + full += src.get(); + } } catch (std::exception& e) { throw traceback("failed to extract large vector length", e); } @@ -85,16 +60,16 @@ inline bool little_endian() { typedef std::array Header; -template -Header parse_header(Reader& reader, std::vector& leftovers) try { +template +Header parse_header(Source_& src) try { Header details; - extract_up_to(reader, leftovers, 4, - [&](const unsigned char* buffer, size_t n, size_t i) -> void { - for (size_t b = 0; b < n; ++b, ++i) { - details[i] = buffer[b]; - } + int i = 0; + for (int b = 0; b < 4; ++b, ++i) { + if (!src.advance()) { + throw empty_error(); } - ); + details[i] = src.get(); + } return details; } catch (std::exception& e) { throw traceback("failed to parse the R object header", e); diff --git a/tests/R/RcppExports.R b/tests/R/RcppExports.R index cadaa95..f2243fe 100644 --- a/tests/R/RcppExports.R +++ b/tests/R/RcppExports.R @@ -6,6 +6,11 @@ parse <- function(file_name) { .Call('_rds2cpp_parse', PACKAGE = 'rds2cpp', file_name) } +#' @export +parallel_parse <- function(file_name) { + .Call('_rds2cpp_parallel_parse', PACKAGE = 'rds2cpp', file_name) +} + #' @export write <- function(x, file_name) { .Call('_rds2cpp_write', PACKAGE = 'rds2cpp', x, file_name) diff --git a/tests/src/RcppExports.cpp b/tests/src/RcppExports.cpp index 1ce18f0..87c3625 100644 --- a/tests/src/RcppExports.cpp +++ b/tests/src/RcppExports.cpp @@ -20,6 +20,16 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// parallel_parse +Rcpp::RObject parallel_parse(std::string file_name); +RcppExport SEXP _rds2cpp_parallel_parse(SEXP file_nameSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< std::string >::type file_name(file_nameSEXP); + rcpp_result_gen = Rcpp::wrap(parallel_parse(file_name)); + return rcpp_result_gen; +END_RCPP +} // write Rcpp::RObject write(Rcpp::RObject x, std::string file_name); RcppExport SEXP _rds2cpp_write(SEXP xSEXP, SEXP file_nameSEXP) { @@ -34,6 +44,7 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_rds2cpp_parse", (DL_FUNC) &_rds2cpp_parse, 1}, + {"_rds2cpp_parallel_parse", (DL_FUNC) &_rds2cpp_parallel_parse, 1}, {"_rds2cpp_write", (DL_FUNC) &_rds2cpp_write, 2}, {NULL, NULL, 0} }; diff --git a/tests/src/parse.cpp b/tests/src/parse.cpp index 7eccb47..e4ac683 100644 --- a/tests/src/parse.cpp +++ b/tests/src/parse.cpp @@ -185,10 +185,7 @@ Rcpp::RObject convert(const rds2cpp::RObject* input) { return R_NilValue; } -//' @export -//[[Rcpp::export(rng=false)]] -Rcpp::RObject parse(std::string file_name) { - auto output = rds2cpp::parse_rds(file_name); +Rcpp::RObject parse_output(const rds2cpp::RdsFile& output) { if (output.object == nullptr) { return R_NilValue; } @@ -260,3 +257,17 @@ Rcpp::RObject parse(std::string file_name) { Rcpp::Named("external_pointers") = all_exts ); } + +//' @export +//[[Rcpp::export(rng=false)]] +Rcpp::RObject parse(std::string file_name) { + auto output = rds2cpp::parse_rds(file_name); + return parse_output(output); +} + +//' @export +//[[Rcpp::export(rng=false)]] +Rcpp::RObject parallel_parse(std::string file_name) { + auto output = rds2cpp::parse_rds(file_name); + return parse_output(output); +} diff --git a/tests/tests/testthat/test-atomic.R b/tests/tests/testthat/test-atomic.R index 4e75e4f..6e003e4 100644 --- a/tests/tests/testthat/test-atomic.R +++ b/tests/tests/testthat/test-atomic.R @@ -21,6 +21,9 @@ test_that("integer vector loading works as expected", { saveRDS(y, file=tmp) roundtrip <- rds2cpp:::parse(tmp) expect_identical(roundtrip$value, y) + + roundtrip <- rds2cpp:::parallel_parse(tmp) + expect_identical(roundtrip$value, y) } })