Skip to content

Commit

Permalink
Improve parsing efficiency by switching to byteme::PerByte.
Browse files Browse the repository at this point in the history
This eliminates the need for the leftover buffer and avoids the unnecessary
copies that were made to shift the used bytes in the leftover buffer at every
call to the (now removed) extract_up_to. Such copying caused major performance
degradation when parsing complex objects with lots of nested structure but little actual data.

In addition, we now have access to PerByteParallel, which allows us to
read from disk and parse in parallel. This should further improve performance.
  • Loading branch information
LTLA authored Sep 4, 2023
1 parent 6ec74fa commit c27d493
Show file tree
Hide file tree
Showing 21 changed files with 360 additions and 295 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.24)

project(rds2cpp
VERSION 1.0.1
VERSION 1.1.0
DESCRIPTION "Standalone C++ library for reading RDS files"
LANGUAGES CXX)

Expand Down
70 changes: 35 additions & 35 deletions include/rds2cpp/parse_altrep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,28 @@

namespace rds2cpp {

template<class Reader>
IntegerVector parse_integer_body(Reader&, std::vector<unsigned char>&);
template<class Source_>
IntegerVector parse_integer_body(Source_&);

template<class Reader>
DoubleVector parse_double_body(Reader& reader, std::vector<unsigned char>&);
template<class Source_>
DoubleVector parse_double_body(Source_& src);

template<class Reader>
std::unique_ptr<RObject> parse_object(Reader&, std::vector<unsigned char>&, SharedParseInfo&);
template<class Source_>
std::unique_ptr<RObject> parse_object(Source_&, SharedParseInfo&);

template<class Reader>
PairList parse_pairlist_body(Reader&, std::vector<unsigned char>&, const Header&, SharedParseInfo&);
template<class Source_>
PairList parse_pairlist_body(Source_&, const Header&, SharedParseInfo&);

namespace altrep_internal {

template<class Vector, class Reader>
Vector parse_numeric_compact_seq(Reader& reader, std::vector<unsigned char>& leftovers) try {
auto header = parse_header(reader, leftovers);
template<class Vector, class Source_>
Vector parse_numeric_compact_seq(Source_& src) try {
auto header = parse_header(src);
if (header[3] != static_cast<unsigned char>(SEXPType::REAL)) {
throw std::runtime_error("expected compact_seq to store sequence information in doubles");
}

auto info = parse_double_body(reader, leftovers);
auto info = parse_double_body(src);
const auto& ranges = info.data;
if (ranges.size() != 3) {
throw std::runtime_error("expected compact_seq's sequence information to be of length 3");
Expand All @@ -49,7 +49,7 @@ Vector parse_numeric_compact_seq(Reader& reader, std::vector<unsigned char>& lef
output.data[i] = start;
}

auto terminator = parse_header(reader, leftovers);
auto terminator = parse_header(src);
if (terminator[3] != 254) {
throw std::runtime_error("failed to terminate a compact_seq ALTREP correctly");
}
Expand All @@ -59,36 +59,36 @@ Vector parse_numeric_compact_seq(Reader& reader, std::vector<unsigned char>& lef
throw traceback("failed to parse compact numeric ALTREP", e);
}

template<class Vector, class Reader>
Vector parse_attribute_wrapper(Reader& reader, std::vector<unsigned char>& leftovers, SharedParseInfo& shared) try {
auto plist_header = parse_header(reader, leftovers);
template<class Vector, class Source_>
Vector parse_attribute_wrapper(Source_& src, SharedParseInfo& shared) try {
auto plist_header = parse_header(src);
if (plist_header[3] != static_cast<unsigned char>(SEXPType::LIST)) {
throw std::runtime_error("expected pairlist in wrap_* ALTREP's payload");
}

// First pairlist element is a CONS cell where the first value is the wrapped integer vector.

auto contents = parse_object(reader, leftovers, shared);
auto contents = parse_object(src, shared);
if (contents->type() != Vector::vector_sexp_type) {
throw std::runtime_error("incorrectly typed contents in wrap_* ALTREP's payload");
}

// Second cons value is the wrapping metadata, we don't care about it.
auto metaheader = parse_header(reader, leftovers);
auto metaheader = parse_header(src);
if (metaheader[3] != static_cast<unsigned char>(SEXPType::INT)) {
throw std::runtime_error("wrap_* ALTREP should have an integer vector for its metadata");
}

auto metadata = parse_integer_body(reader, leftovers);
auto metadata = parse_integer_body(src);
if (metadata.data.size() != 2) {
throw std::runtime_error("wrap_* ALTREP's metadata should be a length-2 integer vector");
}

// Now we can finally get the attributes, which makes up the rest of the pairlist.
auto coerced = static_cast<Vector*>(contents.get());
auto attrheader = parse_header(reader, leftovers);
auto attrheader = parse_header(src);
if (attrheader[3] == static_cast<unsigned>(SEXPType::LIST)) {
parse_attributes_body(reader, leftovers, attrheader, coerced->attributes, shared);
parse_attributes_body(src, attrheader, coerced->attributes, shared);
} else if (attrheader[3] != static_cast<unsigned>(SEXPType::NILVALUE_)) {
throw std::runtime_error("wrap_* ALTREP's attributes should be a pairlist or NULL");
}
Expand All @@ -98,15 +98,15 @@ Vector parse_attribute_wrapper(Reader& reader, std::vector<unsigned char>& lefto
throw traceback("failed to parse attribute-wrapped ALTREP", e);
}

template<class Reader>
StringVector parse_deferred_string(Reader& reader, std::vector<unsigned char>& leftovers, SharedParseInfo& shared) try {
auto plist_header = parse_header(reader, leftovers);
template<class Source_>
StringVector parse_deferred_string(Source_& src, SharedParseInfo& shared) try {
auto plist_header = parse_header(src);
if (plist_header[3] != static_cast<unsigned char>(SEXPType::LIST)) {
throw std::runtime_error("expected pairlist in deferred_string ALTREP's payload");
}

// First pairlist element is a CONS cell where the first value is the thing to be converted.
auto contents = parse_object(reader, leftovers, shared);
auto contents = parse_object(src, shared);
StringVector output;

if (contents->type() == SEXPType::INT){
Expand Down Expand Up @@ -160,18 +160,18 @@ StringVector parse_deferred_string(Reader& reader, std::vector<unsigned char>& l
}

// Second cons value is the wrapping metadata, we don't care about it.
auto metaheader = parse_header(reader, leftovers);
auto metaheader = parse_header(src);
if (metaheader[3] != static_cast<unsigned char>(SEXPType::INT)) {
throw std::runtime_error("deferred_string ALTREP should have an integer vector for its metadata");
}

auto metadata = parse_integer_body(reader, leftovers);
auto metadata = parse_integer_body(src);
if (metadata.data.size() != 1) {
throw std::runtime_error("deferred_string ALTREP's metadata should be a length-1 integer vector");
}

// Chomp up the null.
auto terminator = parse_header(reader, leftovers);
auto terminator = parse_header(src);
if (terminator[3] != static_cast<unsigned char>(SEXPType::NILVALUE_)) {
throw std::runtime_error("failed to terminate a deferred string ALTREP correctly");
}
Expand All @@ -183,14 +183,14 @@ StringVector parse_deferred_string(Reader& reader, std::vector<unsigned char>& l

}

template<class Reader>
std::unique_ptr<RObject> parse_altrep_body(Reader& reader, std::vector<unsigned char>& leftovers, SharedParseInfo& shared) try {
auto header = parse_header(reader, leftovers);
template<class Source_>
std::unique_ptr<RObject> parse_altrep_body(Source_& src, SharedParseInfo& shared) try {
auto header = parse_header(src);
if (header[3] != static_cast<unsigned char>(SEXPType::LIST)) {
throw std::runtime_error("expected ALTREP description to be a pairlist");
}

auto plist = parse_pairlist_body(reader, leftovers, header, shared);
auto plist = parse_pairlist_body(src, header, shared);
if (plist.data.size() < 1 || plist.data[0]->type() != SEXPType::SYM) {
throw std::runtime_error("expected type specification symbol in the ALTREP description");
}
Expand All @@ -204,11 +204,11 @@ std::unique_ptr<RObject> parse_altrep_body(Reader& reader, std::vector<unsigned
const auto& symb = shared.symbols[sdx->index];

if (symb.name == "wrap_integer") {
pointerize_(altrep_internal::parse_attribute_wrapper<IntegerVector>(reader, leftovers, shared));
pointerize_(altrep_internal::parse_attribute_wrapper<IntegerVector>(src, shared));
} else if (symb.name == "compact_intseq") {
pointerize_(altrep_internal::parse_numeric_compact_seq<IntegerVector>(reader, leftovers));
pointerize_(altrep_internal::parse_numeric_compact_seq<IntegerVector>(src));
} else if (symb.name == "deferred_string") {
pointerize_(altrep_internal::parse_deferred_string(reader, leftovers, shared));
pointerize_(altrep_internal::parse_deferred_string(src, shared));
} else {
throw std::runtime_error("unrecognized ALTREP type '" + symb.name + "'");
}
Expand Down
95 changes: 55 additions & 40 deletions include/rds2cpp/parse_atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,22 @@ namespace rds2cpp {

namespace atomic_internal {

template<class Vector, class Reader>
Vector parse_integer_or_logical_body(Reader& reader, std::vector<unsigned char>& leftovers) {
size_t len = get_length(reader, leftovers);
template<class Vector, class Source_>
Vector parse_integer_or_logical_body(Source_& src) {
size_t len = get_length(src);
Vector output(len);

constexpr size_t width = 4;
static_assert(width == sizeof(decltype(output.data[0])));
size_t byte_length = width * len;

auto ptr = reinterpret_cast<unsigned char*>(output.data.data());
extract_up_to(reader, leftovers, width * len,
[&](const unsigned char* buffer, size_t n, size_t i) -> void {
std::copy(buffer, buffer + n, ptr + i);
for (size_t i = 0; i < byte_length; ++i) {
if (!src.advance()) {
throw empty_error();
}
);
ptr[i] = src.get();
}

// Flipping endianness.
if (little_endian()) {
Expand All @@ -39,32 +43,36 @@ Vector parse_integer_or_logical_body(Reader& reader, std::vector<unsigned char>&

}

template<class Reader>
IntegerVector parse_integer_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
return atomic_internal::parse_integer_or_logical_body<IntegerVector>(reader, leftovers);
template<class Source_>
IntegerVector parse_integer_body(Source_& src) try {
return atomic_internal::parse_integer_or_logical_body<IntegerVector>(src);
} catch (std::exception& e) {
throw traceback("failed to parse data for an integer vector", e);
}

template<class Reader>
LogicalVector parse_logical_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
return atomic_internal::parse_integer_or_logical_body<LogicalVector>(reader, leftovers);
template<class Source_>
LogicalVector parse_logical_body(Source_& src) try {
return atomic_internal::parse_integer_or_logical_body<LogicalVector>(src);
} catch (std::exception& e) {
throw traceback("failed to parse data for a logical vector", e);
}

template<class Reader>
DoubleVector parse_double_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
size_t len = get_length(reader, leftovers);
template<class Source_>
DoubleVector parse_double_body(Source_& src) try {
size_t len = get_length(src);
DoubleVector output(len);

constexpr size_t width = 8;
static_assert(width == sizeof(decltype(output.data[0])));
size_t byte_length = width * len;

auto ptr = reinterpret_cast<unsigned char*>(output.data.data());
extract_up_to(reader, leftovers, width * len,
[&](const unsigned char* buffer, size_t n, size_t i) -> void {
std::copy(buffer, buffer + n, ptr + i);
for (size_t i = 0; i < byte_length; ++i) {
if (!src.advance()) {
throw empty_error();
}
);
ptr[i] = src.get();
}

// Flipping endianness.
if (little_endian()) {
Expand All @@ -79,41 +87,48 @@ DoubleVector parse_double_body(Reader& reader, std::vector<unsigned char>& lefto
throw traceback("failed to parse data for a double vector", e);
}

template<class Reader>
RawVector parse_raw_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
size_t len = get_length(reader, leftovers);
template<class Source_>
RawVector parse_raw_body(Source_& src) try {
size_t len = get_length(src);
RawVector output(len);

auto ptr = reinterpret_cast<unsigned char*>(output.data.data());
extract_up_to(reader, leftovers, len,
[&](const unsigned char* buffer, size_t n, size_t i) -> void {
std::copy(buffer, buffer + n, ptr + i);
for (size_t i = 0; i < len; ++i) {
if (!src.advance()) {
throw empty_error();
}
);
ptr[i] = src.get();
}

return output;
} catch (std::exception& e) {
throw traceback("failed to parse data for a raw vector", e);
}

template<class Reader>
ComplexVector parse_complex_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
size_t len = get_length(reader, leftovers);
template<class Source_>
ComplexVector parse_complex_body(Source_& src) try {
size_t len = get_length(src);
ComplexVector output(len);

constexpr size_t width = 16;
static_assert(width == sizeof(decltype(output.data[0])));
size_t byte_length = width * len;

auto ptr = reinterpret_cast<unsigned char*>(output.data.data());
extract_up_to(reader, leftovers, width * len,
[&](const unsigned char* buffer, size_t n, size_t i) -> void {
std::copy(buffer, buffer + n, ptr + i);
for (size_t b = 0; b < byte_length; ++b) {
if (!src.advance()) {
throw empty_error();
}
);
ptr[b] = src.get();
}

// Flipping endianness for each double.
if (little_endian()) {
constexpr size_t single_width = width / 2;
size_t single_length = len * 2;
auto copy = ptr;
for (size_t n = 0; n < len * 2; ++n, copy += width / 2) {
std::reverse(copy, copy + width/2);
for (size_t n = 0; n < single_length; ++n, copy += single_width) {
std::reverse(copy, copy + single_width);
}
}

Expand All @@ -122,12 +137,12 @@ ComplexVector parse_complex_body(Reader& reader, std::vector<unsigned char>& lef
throw traceback("failed to parse data for a complex vector", e);
}

template<class Reader>
StringVector parse_string_body(Reader& reader, std::vector<unsigned char>& leftovers) try {
size_t len = get_length(reader, leftovers);
template<class Source_>
StringVector parse_string_body(Source_& src) try {
size_t len = get_length(src);
StringVector output(len);
for (size_t i = 0; i < len; ++i) {
auto str = parse_single_string(reader, leftovers);
auto str = parse_single_string(src);
output.data[i] = str.value;
output.encodings[i] = str.encoding;
output.missing[i] = str.missing;
Expand Down
18 changes: 9 additions & 9 deletions include/rds2cpp/parse_attributes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@

namespace rds2cpp {

template<class Reader>
PairList parse_pairlist_body(Reader&, std::vector<unsigned char>&, const Header&, SharedParseInfo&);
template<class Source_>
PairList parse_pairlist_body(Source_&, const Header&, SharedParseInfo&);

inline bool has_attributes(const Header& header) {
return (header[2] & 0x2);
}

template<class Reader>
void parse_attributes_body(Reader& reader, std::vector<unsigned char>& leftovers, const Header& header, Attributes& output, SharedParseInfo& shared) try {
auto plist = parse_pairlist_body(reader, leftovers, header, shared);
template<class Source_>
void parse_attributes_body(Source_& src, const Header& header, Attributes& output, SharedParseInfo& shared) try {
auto plist = parse_pairlist_body(src, header, shared);

size_t nnodes = plist.data.size();
for (size_t t = 0; t < nnodes; ++t) {
Expand All @@ -35,13 +35,13 @@ void parse_attributes_body(Reader& reader, std::vector<unsigned char>& leftovers
throw traceback("failed to parse attribute contents", e);
}

template<class Reader>
void parse_attributes(Reader& reader, std::vector<unsigned char>& leftovers, Attributes& output, SharedParseInfo& shared) try {
auto header = parse_header(reader, leftovers);
template<class Source_>
void parse_attributes(Source_& src, Attributes& output, SharedParseInfo& shared) try {
auto header = parse_header(src);
if (header[3] != static_cast<unsigned>(SEXPType::LIST)) {
throw std::runtime_error("attributes should be a pairlist");
}
parse_attributes_body(reader, leftovers, header, output, shared);
parse_attributes_body(src, header, output, shared);
return;
} catch (std::exception& e) {
throw traceback("failed to parse attributes", e);
Expand Down
Loading

0 comments on commit c27d493

Please sign in to comment.