Skip to content

Commit

Permalink
Implement Buffer::extract_nested_buffers, switch back to auto_grow::i…
Browse files Browse the repository at this point in the history
…nternal

Turns out, auto_grow::yes causes significant runtime overhead due to all the memmoves.
  • Loading branch information
BenWiederhake committed Nov 26, 2023
1 parent 17dc80f commit a36b54c
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 59 deletions.
70 changes: 45 additions & 25 deletions examples/osmium_read_pbf_randomaccess.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@
#include <osmium/io/pbf_input_randomaccess.hpp>
#include <osmium/visitor.hpp>

static void describe_buffers(const std::vector<std::unique_ptr<osmium::memory::Buffer>>& buffers) {
size_t total_capacity = 0;
size_t total_written = 0;
for (const auto& buffer : buffers) {
total_capacity += buffer->capacity();
total_written += buffer->written();
}
std::cout << buffers.size() << " buffers, with total capacity " << total_capacity << " and total written " << total_written << std::endl;
}

int main(int argc, char* argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " OSMFILE\n";
Expand All @@ -29,40 +39,50 @@ int main(int argc, char* argv[]) {

osmium::io::PbfBlockIndexTable table {argv[1]};
std::cout << "File has " << table.block_starts().size() << " blocks." << std::endl;
auto buffer = table.binary_search_object(osmium::item_type::way, 40106791, osmium::io::read_meta::no);
std::cout << "buffer1 has capacity=" << buffer.capacity() << ", committed=" << buffer.committed() << ", written=" << buffer.written() << std::endl;
if (!buffer) {

// Insert "return 0;" here, if you want to measure only the index-building time.

auto buffers = table.binary_search_object(osmium::item_type::way, 40106791, osmium::io::read_meta::no);
std::cout << "buffers1 ";
describe_buffers(buffers);
if (buffers.empty()) {
std::cout << "File definitely doesn't contain way 40106791." << std::endl;
} else {
bool found = false;
size_t num_seen = 0;
for (auto it = buffer.begin<osmium::OSMObject>(); it != buffer.end<osmium::OSMObject>(); ++it) {
num_seen += 1;
if (it->type() == osmium::item_type::way && it->id() == 40106791) {
found = true;
std::cout << "File contains way 40106791!\n";
auto& way = static_cast<osmium::Way&>(*it);
std::cout << "Nodes are: ";
auto& nodes = way.nodes();
bool is_first = true;
for (auto& ref : nodes) {
if (is_first) {
is_first = false;
} else {
std::cout << ", ";
for (auto it = buffers.rbegin(); it != buffers.rend(); ++it) {
const auto& buffer = *it;
for (auto it = buffer->begin<osmium::OSMObject>(); it != buffer->end<osmium::OSMObject>(); ++it) {
num_seen += 1;
if (it->type() == osmium::item_type::way && it->id() == 40106791) {
found = true;
std::cout << "File contains way 40106791!\n";
auto& way = static_cast<osmium::Way&>(*it);
std::cout << "Nodes are: ";
auto& nodes = way.nodes();
bool is_first = true;
for (auto& ref : nodes) {
if (is_first) {
is_first = false;
} else {
std::cout << ", ";
}
std::cout << ref.ref();
}
std::cout << ref.ref();
std::cout << "." << std::endl;
break;
}
std::cout << "." << std::endl;
break;
}
if (found)
break;
}
if (found) {
} else {
std::cerr << "File contains a promising-looking block, but it does not contain way 40106791.\n";
std::cout << "Block contains " << num_seen << " objects.\n";
if (!found) {
std::cout << "File contains a promising-looking block, but it does not contain way 40106791.\n";
}
}
auto buffer2 = table.binary_search_object(osmium::item_type::relation, 123456, osmium::io::read_meta::no);
std::cout << "buffer2 has capacity=" << buffer2.capacity() << ", committed=" << buffer2.committed() << ", written=" << buffer2.written() << std::endl;
auto buffers2 = table.binary_search_object(osmium::item_type::relation, 123456, osmium::io::read_meta::no);
std::cout << "buffers2 ";
describe_buffers(buffers2);
}

35 changes: 23 additions & 12 deletions include/osmium/io/pbf_input_randomaccess.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ namespace osmium {
* @pre block_index must be a valid index into m_block_starts.
* @returns The decoded block
*/
osmium::memory::Buffer get_parsed_block(size_t block_index, const osmium::io::read_meta read_metadata) {
std::vector<std::unique_ptr<osmium::memory::Buffer>> get_parsed_block(size_t block_index, const osmium::io::read_meta read_metadata) {
/* Because we might need to read the block to update m_block_starts, *all* item types must be decoded. This should not be a problem anyway, because the block likely only contains items of the desired type, as items should be sorted first by type, then by ID. */
const osmium::osm_entity_bits::type read_types = osmium::osm_entity_bits::all;

Expand All @@ -311,17 +311,19 @@ namespace osmium {
/* auto_grow::internal leads to linked lists, and iterating it will be "accidentally quadratic".
* While this may be okay for linear scans, it makes it unnecessarily difficult to quickly check the first item in the buffer.
* Thus, override it to auto_grow::yes, which results in a single, huge, but *contiguous* buffer. */
osmium::io::detail::PBFDataBlobDecoder data_blob_parser{std::move(input_buffer), read_types, read_metadata, osmium::memory::Buffer::auto_grow::yes};
osmium::io::detail::PBFDataBlobDecoder data_blob_parser{std::move(input_buffer), read_types, read_metadata, osmium::memory::Buffer::auto_grow::internal};

std::vector<std::unique_ptr<osmium::memory::Buffer>> buffers = data_blob_parser().extract_nested_buffers();

osmium::memory::Buffer buffer = data_blob_parser();
if (!block_start.is_populated()) {
const osmium::memory::Buffer& buffer = *buffers.back();
auto it = buffer.begin<osmium::OSMObject>();
if (it != buffer.end<osmium::OSMObject>()) {
block_start.first_item_id_or_zero = it->id();
block_start.first_item_type_or_zero = it->type();
}
}
return buffer;
return buffers;
}

/**
Expand Down Expand Up @@ -424,7 +426,7 @@ namespace osmium {
* @pre The data must be sorted by type first, and object id second
* @returns The decoded block
*/
osmium::memory::Buffer binary_search_object(
std::vector<std::unique_ptr<osmium::memory::Buffer>> binary_search_object(
const osmium::item_type needle_item_type,
const osmium::object_id_type needle_item_id,
const osmium::io::read_meta read_metadata
Expand All @@ -437,7 +439,7 @@ namespace osmium {
while (end_search - begin_search >= 2) {
size_t middle_search = osmium::io::detail::binsearch_middle(begin_search, end_search);
assert(!m_block_starts[middle_search].is_populated());
osmium::memory::Buffer buffer = get_parsed_block(middle_search, read_metadata);
std::vector<std::unique_ptr<osmium::memory::Buffer>> buffers = get_parsed_block(middle_search, read_metadata);
assert(m_block_starts[middle_search].is_populated());
if (m_block_starts[middle_search].is_needle_definitely_before(needle_item_type, needle_item_id)) {
end_search = middle_search;
Expand All @@ -450,26 +452,35 @@ namespace osmium {
* TODO: Measure, and perhaps store also the *last* object type and ID in pbf_block_start.
* TODO: Measure, and perhaps discard the current block in favor of eager binary search.
* (This needs to be done in get_parsed_block.)
* As a convenience, only search the last buffer of the block. This means that it cannot always be determined if this block contains the needle.
*/
for (auto it = buffer.begin<osmium::OSMObject>(); it != buffer.end<osmium::OSMObject>(); ++it) {
const osmium::memory::Buffer& last_buffer = *buffers.front();
bool saw_less_in_last_buffer = false;
for (auto it = last_buffer.begin<osmium::OSMObject>(); it != last_buffer.end<osmium::OSMObject>(); ++it) {
int cmp = osmium::io::detail::compare_by_type_then_id(needle_item_type, needle_item_id, it->type(), it->id());
if (cmp == 0) {
/* Can abort the search here, since accidentally the answer was found.
* Note that because the last object ID is not cached, the next call for this exact type and ID will actually be slower, as it will do a more naive binary search. See the to-do above. */
return buffer;
return buffers;
}
if (cmp < 0) {
/* Can abort the search here, since the needle would have needed to appeared before the current item.
* Note that because the last object ID is not cached, the next call for this exact type and ID will actually be slower, as it will do a more naive binary search. See the to-do above. */
return osmium::memory::Buffer();
if (saw_less_in_last_buffer) {
/* Can abort the search here, since the needle would have needed to appeared before the current item.
* Note that because the last object ID is not cached, the next call for this exact type and ID will actually be slower, as it will do a more naive binary search. See the to-do above. */
return {};
} else {
/* Can abort the search here, since the needle is greater-equal to the first object in the first buffer of this block, and smaller than the first object in the last buffer. This means the needle cannot be in any other block. */
return buffers;
}
}
saw_less_in_last_buffer = true;
}
/* The buffer definitely does not contain the needle. */
begin_search = middle_search + 1;
assert(begin_search <= end_search);
}
if (begin_search == end_search) {
return osmium::memory::Buffer();
return {};
}
assert(begin_search == end_search - 1);
return get_parsed_block(begin_search, read_metadata);
Expand Down
25 changes: 25 additions & 0 deletions include/osmium/memory/buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ DEALINGS IN THE SOFTWARE.
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>

namespace osmium {

Expand Down Expand Up @@ -418,6 +419,30 @@ namespace osmium {
return std::move(buffer->m_next_buffer);
}

/**
* Return and remove all nested buffers in order, including this
* buffer. This avoids the "accidentally quadratic" overhead of
* get_last_nested(), at the cost of two allocations.
*
* Note that this means that the newest buffer is at the front,
* and the oldest buffer is at the back.
*
* @pre The buffer must be valid.
* @pre The buffer will be invalid.
*/
std::vector<std::unique_ptr<Buffer>> extract_nested_buffers() {
assert(m_data && "This must be a valid buffer");
std::vector<std::unique_ptr<Buffer>> all_buffers;
Buffer* container = new Buffer(std::move(*this));
all_buffers.push_back(std::unique_ptr<Buffer>(container));
while (container->has_nested_buffers()) {
Buffer* child = container->m_next_buffer.release();
all_buffers.push_back(std::unique_ptr<Buffer>(child));
container = child;
}
return all_buffers;
}

/**
* Mark currently written bytes in the buffer as committed.
*
Expand Down
53 changes: 31 additions & 22 deletions test/t/io/test_pbf_randomaccess.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,42 +61,48 @@ TEST_CASE("Can read planet quickly") {

SECTION("Can 'randomly' access the first block") {
auto middle_block = table.get_parsed_block(0, osmium::io::read_meta::no);
auto it = middle_block.begin<osmium::OSMObject>();
auto it = middle_block.front()->begin<osmium::OSMObject>();
REQUIRE(it->type() == osmium::item_type::node);
REQUIRE(it->id() == 1);
}

SECTION("Can randomly access some arbitrary block") {
auto middle_block = table.get_parsed_block(1234, osmium::io::read_meta::no);
auto it = middle_block.begin<osmium::OSMObject>();
auto it = middle_block.front()->begin<osmium::OSMObject>();
REQUIRE(it->type() == osmium::item_type::node);
REQUIRE(it->id() >= 100000000);
}

SECTION("Find a late node") {
auto buffer = table.binary_search_object(osmium::item_type::node, 5301526002, osmium::io::read_meta::no);
REQUIRE(buffer);
auto buffers = table.binary_search_object(osmium::item_type::node, 5301526002, osmium::io::read_meta::no);
REQUIRE(!buffers.empty());
bool found = false;
size_t num_seen = 0;
for (auto it = buffer.begin<osmium::OSMObject>(); it != buffer.end<osmium::OSMObject>(); ++it) {
num_seen += 1;
if (it->type() == osmium::item_type::node && it->id() == 5301526002) {
found = true;
break;
for (const auto& buffer : buffers) {
for (auto it = buffer->begin<osmium::OSMObject>(); it != buffer->end<osmium::OSMObject>(); ++it) {
if (it->type() == osmium::item_type::node && it->id() == 5301526002) {
found = true;
break;
}
}
if (found)
break;
}
REQUIRE(found);
}

SECTION("Find an early node") {
auto buffer = table.binary_search_object(osmium::item_type::node, 123, osmium::io::read_meta::no);
REQUIRE(buffer);
auto buffers = table.binary_search_object(osmium::item_type::node, 123, osmium::io::read_meta::no);
REQUIRE(!buffers.empty());
bool found = false;
for (auto it = buffer.begin<osmium::OSMObject>(); it != buffer.end<osmium::OSMObject>(); ++it) {
if (it->type() == osmium::item_type::node && it->id() == 123) {
found = true;
break;
for (const auto& buffer : buffers) {
for (auto it = buffer->begin<osmium::OSMObject>(); it != buffer->end<osmium::OSMObject>(); ++it) {
if (it->type() == osmium::item_type::node && it->id() == 123) {
found = true;
break;
}
}
if (found)
break;
}
REQUIRE(found);
}
Expand Down Expand Up @@ -140,11 +146,12 @@ TEST_CASE("Access only middle block of PBF files, and check binary search") {
REQUIRE(table.block_starts()[2].first_item_type_or_zero == osmium::item_type::undefined);
REQUIRE(!table.block_starts()[2].is_populated());
{
auto it = middle_block.begin<osmium::OSMObject>();
REQUIRE(middle_block.size() == 1);
auto it = middle_block[0]->begin<osmium::OSMObject>();
REQUIRE(it->id() == 20);
REQUIRE(it->type() == osmium::item_type::way);
++it;
REQUIRE(it == middle_block.end<osmium::OSMObject>());
REQUIRE(it == middle_block[0]->end<osmium::OSMObject>());
}
require_binary_search_result(table, osmium::item_type::node, 5, 0, 1, 0);
require_binary_search_result(table, osmium::item_type::node, 10, 0, 1, 0);
Expand All @@ -168,7 +175,8 @@ TEST_CASE("Access only middle block of PBF files, and check binary search") {
REQUIRE(table.block_starts()[2].first_item_id_or_zero == 0);
REQUIRE(table.block_starts()[2].first_item_type_or_zero == osmium::item_type::undefined);
{
auto it = first_block.begin<osmium::OSMObject>();
REQUIRE(first_block.size() == 1);
auto it = first_block[0]->begin<osmium::OSMObject>();
REQUIRE(it->id() == 10);
REQUIRE(it->type() == osmium::item_type::node);
++it;
Expand All @@ -184,7 +192,7 @@ TEST_CASE("Access only middle block of PBF files, and check binary search") {
REQUIRE(it->id() == 14);
REQUIRE(it->type() == osmium::item_type::node);
++it;
REQUIRE(it == first_block.end<osmium::OSMObject>());
REQUIRE(it == first_block[0]->end<osmium::OSMObject>());
}
require_binary_search_result(table, osmium::item_type::node, 5, 0, 0, 3);
require_binary_search_result(table, osmium::item_type::node, 10, 0, 1, 0);
Expand All @@ -208,7 +216,8 @@ TEST_CASE("Access only middle block of PBF files, and check binary search") {
REQUIRE(table.block_starts()[2].first_item_id_or_zero == 30);
REQUIRE(table.block_starts()[2].first_item_type_or_zero == osmium::item_type::relation);
{
auto it = last_block.begin<osmium::OSMObject>();
REQUIRE(last_block.size() == 1);
auto it = last_block[0]->begin<osmium::OSMObject>();
REQUIRE(it->id() == 30);
REQUIRE(it->type() == osmium::item_type::relation);
++it;
Expand All @@ -218,7 +227,7 @@ TEST_CASE("Access only middle block of PBF files, and check binary search") {
REQUIRE(it->id() == 32);
REQUIRE(it->type() == osmium::item_type::relation);
++it;
REQUIRE(it == last_block.end<osmium::OSMObject>());
REQUIRE(it == last_block[0]->end<osmium::OSMObject>());
}
require_binary_search_result(table, osmium::item_type::node, 5, 0, 0, 3);
require_binary_search_result(table, osmium::item_type::node, 10, 0, 1, 0);
Expand Down

0 comments on commit a36b54c

Please sign in to comment.