Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use BPlusTree to hold backlinks #6673

Merged
merged 6 commits into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
### Fixed
* <How does the end-user experience this issue? What was the impact?> ([#????](https://github.com/realm/realm-core/issues/????), since v?.?.?)
* Align dictionaries to Lists and Sets when they get cleared. ([#6205](https://github.com/realm/realm-core/issues/6205), since v10.4.0)
* If you have more than 8388606 links pointing to one specific object, the program will crash. ([#6577](https://github.com/realm/realm-core/issues/6577), since v6.0.0)

### Breaking changes
* Upgrading from Realm files produced by RealmCore v5.23.9 or earlier is no longer supported.
Expand Down
42 changes: 34 additions & 8 deletions src/realm/array_backlink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ void ArrayBacklink::nullify_fwd_links(size_t ndx, CascadeState& state)
else {
// There is more than one backlink - Iterate through them all
ref_type ref = to_ref(value);
Array backlink_list(m_alloc);
BPlusTree<int64_t> backlink_list(m_alloc);
backlink_list.init_from_ref(ref);

size_t sz = backlink_list.size();
Expand All @@ -83,16 +83,17 @@ void ArrayBacklink::add(size_t ndx, ObjKey key)
// When increasing the size of the backlink list from 1 to 2, we need to
// convert from the single non-ref column value representation, to a B+-tree
// representation.
Array backlink_list(m_alloc);
BPlusTree<int64_t> backlink_list(m_alloc);
if ((value & 1) != 0) {
// Create new column to hold backlinks
backlink_list.create(Array::type_Normal);
backlink_list.create();
set_as_ref(ndx, backlink_list.get_ref());
backlink_list.add(value >> 1);
}
else {
backlink_list.init_from_ref(to_ref(value));
backlink_list.set_parent(this, ndx);
backlink_list.split_if_needed();
}
backlink_list.add(key.value); // Throws
}
Expand All @@ -118,17 +119,18 @@ bool ArrayBacklink::remove(size_t ndx, ObjKey key)

// if there is a list of backlinks we have to find
// the right one and remove it.
Array backlink_list(m_alloc);
BPlusTree<int64_t> backlink_list(m_alloc);
backlink_list.init_from_ref(ref_type(value));
backlink_list.set_parent(this, ndx);
backlink_list.split_if_needed();

size_t last_ndx = backlink_list.size() - 1;
size_t backlink_ndx = backlink_list.find_first(key.value);
REALM_ASSERT_DEBUG(backlink_ndx != not_found);
if (backlink_ndx != not_found) {
if (backlink_ndx != last_ndx)
backlink_list.set(backlink_ndx, backlink_list.get(last_ndx));
backlink_list.truncate(last_ndx); // Throws
backlink_list.erase(last_ndx); // Throws
}

// If there is only one backlink left we can inline it as tagged value
Expand All @@ -146,7 +148,7 @@ void ArrayBacklink::erase(size_t ndx)
{
uint64_t value = Array::get(ndx);
if (value && (value & 1) == 0) {
Array::destroy(ref_type(value), m_alloc);
Array::destroy_deep(ref_type(value), m_alloc);
}
Array::erase(ndx);
}
Expand All @@ -166,7 +168,7 @@ size_t ArrayBacklink::get_backlink_count(size_t ndx) const

// return size of list
MemRef mem(ref_type(value), m_alloc);
return Array::get_size_from_header(mem.get_addr());
return BPlusTree<int64_t>::size_from_header(mem.get_addr());
}

ObjKey ArrayBacklink::get_backlink(size_t ndx, size_t index) const
Expand All @@ -181,7 +183,7 @@ ObjKey ArrayBacklink::get_backlink(size_t ndx, size_t index) const
return ObjKey(int64_t(value >> 1));
}

Array backlink_list(m_alloc);
BPlusTree<int64_t> backlink_list(m_alloc);
backlink_list.init_from_ref(ref_type(value));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this going to interpret the data correctly if it is a simple Array that hasn't been split yet?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. A BPlusTree only holding one leaf array, is just a leaf array.


REALM_ASSERT(index < backlink_list.size());
Expand Down Expand Up @@ -236,3 +238,27 @@ void ArrayBacklink::verify() const
}
#endif
}

/// Debug-only check that `link` is recorded as a backlink in slot `ndx`.
///
/// With REALM_DEBUG defined, returns true if the slot holds `link` (either as
/// the single inline tagged value or as an element of the referenced list) and
/// false otherwise. Without REALM_DEBUG no check is made and true is returned.
bool ArrayBacklink::verify_backlink(size_t ndx, int64_t link)
{
#ifdef REALM_DEBUG
    const uint64_t slot = Array::get(ndx);

    // A zero slot holds no backlinks, so nothing can match.
    if (slot == 0) {
        return false;
    }

    // Low bit clear: the slot is a ref to a list of backlinks; search it.
    if ((slot & 1) == 0) {
        BPlusTree<int64_t> links(m_alloc);
        links.init_from_ref(ref_type(slot));
        return links.find_first(link) != realm::not_found;
    }

    // Low bit set: a single backlink is stored inline as a tagged value.
    return int64_t(slot >> 1) == link;
#else
    static_cast<void>(ndx);
    static_cast<void>(link);
    return true;
#endif
}
1 change: 1 addition & 0 deletions src/realm/array_backlink.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class ArrayBacklink : public ArrayPayload, private Array {
Array::truncate_and_destroy_children(0);
}
void verify() const;
bool verify_backlink(size_t ndx, int64_t link);
};
}

Expand Down
11 changes: 1 addition & 10 deletions src/realm/array_key.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,6 @@ void ArrayKeyBase<1>::verify() const

ConstTableRef target_table = origin_table->get_opposite_table(link_col_key);

auto verify_link = [origin_table, link_col_key](const Obj& target_obj, ObjKey origin_key) {
auto cnt = target_obj.get_backlink_count(*origin_table, link_col_key);
for (size_t i = 0; i < cnt; i++) {
if (target_obj.get_backlink(*origin_table, link_col_key, i) == origin_key)
return;
}
REALM_ASSERT(false);
};

// Verify that forward link has a corresponding backlink
for (size_t i = 0; i < size(); ++i) {
if (ObjKey target_key = get(i)) {
Expand All @@ -104,7 +95,7 @@ void ArrayKeyBase<1>::verify() const
auto target_obj = target_key.is_unresolved() ? target_table->try_get_tombstone(target_key)
: target_table->try_get_object(target_key);
REALM_ASSERT(target_obj);
verify_link(target_obj, origin_key);
target_obj.verify_backlink(*origin_table, link_col_key, origin_key);
}
}
#endif
Expand Down
75 changes: 75 additions & 0 deletions src/realm/bplustree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <realm/bplustree.hpp>
#include <realm/impl/destroy_guard.hpp>
#include <realm/array_unsigned.hpp>
#include <realm/array_integer.hpp>

using namespace realm;

Expand Down Expand Up @@ -130,6 +131,8 @@ class BPlusTreeInner : public BPlusTreeNode, private Array {
}
void ensure_offsets();

std::unique_ptr<BPlusTreeInner> split_root();

private:
ArrayUnsigned m_offsets;
size_t m_my_offset;
Expand Down Expand Up @@ -221,6 +224,37 @@ bool BPlusTreeLeaf::bptree_traverse(TraverseFunc func)
return func(this, 0) == IteratorControl::Stop;
}

/// Replaces the current root with a new inner node one level higher.
///
/// If the root is a leaf, its elements are copied into freshly created leaves
/// of at most REALM_MAX_BPNODE_SIZE elements each, which become the children
/// of the new root; the old leaf is then destroyed. If the root is already an
/// inner node, the work is delegated to BPlusTreeInner::split_root().
template <>
void BPlusTree<int64_t>::split_root()
{
    if (!m_root->is_leaf()) {
        // Inner root: it builds its own replacement.
        auto* old_inner = static_cast<BPlusTreeInner*>(m_root.get());
        replace_root(old_inner->split_root());
        return;
    }

    const auto total = m_root->get_node_size();
    auto* old_leaf = static_cast<LeafNode*>(m_root.get());

    auto inner_root = std::make_unique<BPlusTreeInner>(this);
    inner_root->create(REALM_MAX_BPNODE_SIZE);

    // Copy the old leaf's contents out chunk by chunk, one new leaf per chunk.
    for (size_t first = 0; first < total;) {
        LeafNode chunk(this);
        chunk.create();
        const size_t count = std::min(size_t(REALM_MAX_BPNODE_SIZE), total - first);
        for (size_t k = 0; k < count; ++k) {
            chunk.insert(k, old_leaf->get(first + k));
        }
        inner_root->add_bp_node_ref(chunk.get_ref()); // Throws
        first += count;
    }

    inner_root->append_tree_size(total);
    old_leaf->destroy();
    replace_root(std::move(inner_root));
}

/****************************** BPlusTreeInner *******************************/

BPlusTreeInner::BPlusTreeInner(BPlusTreeBase* tree)
Expand Down Expand Up @@ -514,6 +548,36 @@ void BPlusTreeInner::ensure_offsets()
}
}

// Builds and returns a replacement for this inner node that sits one level
// higher: the child refs of this node are distributed, in order, over newly
// created inner nodes holding at most REALM_MAX_BPNODE_SIZE refs each, and
// those new nodes become the children of the returned root. This node is
// destroyed before returning, so the caller must install the returned node
// in its place.
std::unique_ptr<BPlusTreeInner> BPlusTreeInner::split_root()
{
auto new_root = std::make_unique<BPlusTreeInner>(m_tree);
auto sz = get_node_size();
size_t elems_per_child = get_elems_per_child();
// NOTE(review): the argument to create() looks like the elements-per-child
// value of the node being created - confirm against BPlusTreeInner::create().
new_root->create(REALM_MAX_BPNODE_SIZE * elems_per_child);
size_t ndx = 0;
size_t tree_size = get_tree_size();
size_t accumulated_size = 0;
while (ndx < sz) {
BPlusTreeInner new_inner(m_tree);
size_t to_move = std::min(size_t(REALM_MAX_BPNODE_SIZE), sz - ndx);
new_inner.create(elems_per_child);
// Hand over the next `to_move` child refs to the new intermediate node.
for (size_t i = 0; i < to_move; i++, ndx++) {
new_inner.add_bp_node_ref(get_bp_node_ref(ndx));
}
// Size of the new subtree if every moved child were full; clamped so that
// the running total never exceeds this node's tree size (presumably the
// last child may be only partially filled - TODO confirm).
size_t this_size = to_move * elems_per_child;
if (accumulated_size + this_size > tree_size) {
this_size = tree_size - accumulated_size;
}
accumulated_size += this_size;
new_inner.append_tree_size(this_size);
new_root->add_bp_node_ref(new_inner.get_ref()); // Throws
}
// Every element of the old node must be accounted for in the new subtrees.
REALM_ASSERT(accumulated_size == tree_size);
new_root->append_tree_size(tree_size);
// NOTE(review): the child refs are now referenced by the new nodes, so
// destroy() is presumably a shallow free of this node only - confirm.
destroy();
return new_root;
}

inline BPlusTreeLeaf* BPlusTreeInner::cache_leaf(MemRef mem, size_t ndx, size_t offset)
{
BPlusTreeLeaf* leaf = m_tree->cache_leaf(mem);
Expand Down Expand Up @@ -769,3 +833,14 @@ std::unique_ptr<BPlusTreeNode> BPlusTreeBase::create_root_from_ref(ref_type ref)
return new_root;
}
}

size_t BPlusTreeBase::size_from_header(const char* header)
{
auto node_size = Array::get_size_from_header(header);
if (Array::get_is_inner_bptree_node_from_header(header)) {
auto data = Array::get_data_from_header(header);
auto width = Array::get_width_from_header(header);
node_size = size_t(get_direct(data, width, node_size - 1)) >> 1;
}
return node_size;
Comment on lines +839 to +845
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
auto node_size = Array::get_size_from_header(header);
if (Array::get_is_inner_bptree_node_from_header(header)) {
auto data = Array::get_data_from_header(header);
auto width = Array::get_width_from_header(header);
node_size = size_t(get_direct(data, width, node_size - 1)) >> 1;
}
return node_size;
if (Array::get_is_inner_bptree_node_from_header(header)) {
auto data = Array::get_data_from_header(header);
auto width = Array::get_width_from_header(header);
auto node_size = size_t(get_direct(data, width, node_size - 1));
REALM_ASSERT_EX(node_size >= 2, node_size);
return node_size >> 1;
}
return Array::get_size_from_header(header);

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, double checking because my memory of this is fuzzy, is the divide by two always correct for an inner node? Or is it possible that it is not in the compact form. Eg should we divide by get_elems_per_child()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is more a shift right than a divide by two. The size of the subtree is always stored as the last entry, encoded as a tagged number: the value is shifted left by one bit and OR-ed with 1.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not blocking, but you may want to use this suggestion to avoid an unnecessary read from the array header if looking at an inner node.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just don't think your suggestion would compile :-)

}
25 changes: 16 additions & 9 deletions src/realm/bplustree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ class BPlusTreeBase {
return m_size;
}

static size_t size_from_header(const char* header);

bool is_empty() const
{
return m_size == 0;
Expand All @@ -188,16 +190,12 @@ class BPlusTreeBase {

bool init_from_parent()
{
ref_type ref = m_parent->get_child_ref(m_ndx_in_parent);
if (!ref) {
return false;
;
if (ref_type ref = m_parent->get_child_ref(m_ndx_in_parent)) {
init_from_ref(ref);
return true;
}
auto new_root = create_root_from_ref(ref);
new_root->bp_set_parent(m_parent, m_ndx_in_parent);
m_root = std::move(new_root);
invalidate_leaf_cache();
m_size = m_root->get_tree_size();
return true;
return false;
}

void set_parent(ArrayParent* parent, size_t ndx_in_parent)
Expand Down Expand Up @@ -568,6 +566,13 @@ class BPlusTree : public BPlusTreeBase {
m_root->bptree_traverse(func);
}

void split_if_needed()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed? I am under the impression that BPlusTree::insert() will split automatically as needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the current node is just a plain array, then it should be split into a BPlusTree. I am open to another name - I struggled a bit myself to find one.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is an optimization that defers creating a tree until we reach more than 1000 links and then does the transformation on the fly rather than in the migration function of the file format. If that is correct, we should keep current performance the same for fewer than 1000 backlinks?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the size is below 1000, then a BPlusTree is just a plain array, so no need for any transformation. But the BPlusTree implementation is just slower than a plain array. I did not want to create two code paths - one for sizes under 1000 and one for sizes over.

{
while (m_root->get_node_size() > REALM_MAX_BPNODE_SIZE) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please test more than one iteration of this loop eg. more than 1000 * 1000 elements. I think the second iteration will fail because split_root() assumes that the root is a LeafNode but on the second pass they will be all BPlusTreeInner types

Copy link
Contributor Author

@jedelbo jedelbo May 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is tested more thoroughly when the node size is 4. get_node_size() will return the number of child elements - no matter if this is an inner node or a leaf.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, you had the implementation correct all along and I misread it. Thanks for adding the extra tests though!

split_root();
}
}

protected:
LeafNode m_leaf_cache;

Expand Down Expand Up @@ -606,6 +611,8 @@ class BPlusTree : public BPlusTreeBase {

template <class R>
friend R bptree_sum(const BPlusTree<T>& tree);

void split_root();
};

template <class T>
Expand Down
2 changes: 2 additions & 0 deletions src/realm/group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,8 @@ class Group : public ArrayParent {
/// 23 Layout of Set and Dictionary changed.
///
/// 24 Variable sized arrays for Decimal128.
/// Nested collections
/// Backlinks in BPlusTree
///
/// IMPORTANT: When introducing a new file format version, be sure to review
/// the file validity checks in Group::open() and DB::do_open, the file
Expand Down
28 changes: 28 additions & 0 deletions src/realm/obj.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,34 @@ size_t Obj::get_backlink_cnt(ColKey backlink_col) const
return backlinks.get_backlink_count(m_row_ndx);
}

/// Debug-only consistency check: asserts that this object has a backlink
/// entry for `origin_key` coming from column `origin_col_key` of table
/// `origin`. Compiles to a no-op unless REALM_DEBUG is defined.
void Obj::verify_backlink(const Table& origin, ColKey origin_col_key, ObjKey origin_key) const
{
#ifdef REALM_DEBUG
    // For typed links, Mixed and dictionary columns the backlink column is
    // looked up on this object's table; otherwise the origin table supplies
    // the opposite column directly.
    const auto link_type = origin_col_key.get_type();
    const bool lookup_on_own_table =
        link_type == col_type_TypedLink || link_type == col_type_Mixed || origin_col_key.is_dictionary();

    ColKey backlink_col;
    if (!lookup_on_own_table) {
        backlink_col = origin.get_opposite_column(origin_col_key);
    }
    else {
        backlink_col = get_table()->find_backlink_column(origin_col_key, origin.get_key());
    }

    Allocator& allocator = get_alloc();
    Array object_fields(allocator);
    object_fields.init_from_mem(m_mem);

    // The backlink array sits at (column index + 1) in the fields array.
    const auto slot_in_fields = backlink_col.get_index().val + 1;
    ArrayBacklink backlinks(allocator);
    backlinks.set_parent(&object_fields, slot_in_fields);
    backlinks.init_from_parent();

    REALM_ASSERT(backlinks.verify_backlink(m_row_ndx, origin_key.value));
#else
    static_cast<void>(origin);
    static_cast<void>(origin_col_key);
    static_cast<void>(origin_key);
#endif
}

void Obj::traverse_path(Visitor v, PathSizer ps, size_t path_length) const
{
struct BacklinkTraverser : public LinkTranslator {
Expand Down
1 change: 1 addition & 0 deletions src/realm/obj.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ class Obj : public CollectionParent {
size_t get_backlink_count(const Table& origin, ColKey origin_col_key) const;
ObjKey get_backlink(const Table& origin, ColKey origin_col_key, size_t backlink_ndx) const;
TableView get_backlink_view(TableRef src_table, ColKey src_col_key);
void verify_backlink(const Table& origin, ColKey origin_col_key, ObjKey origin_key) const;

// To be used by the query system when a single object should
// be tested. Will allow a function to be called in the context
Expand Down
19 changes: 19 additions & 0 deletions test/test_bplus_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,4 +431,23 @@ TEST(BPlusTree_LeafCache)
tree.destroy();
}

TEST(BPlusTree_UpgradeFromArray)
{
Array arr(Allocator::get_default());
arr.create(Node::type_Normal, false, 0, 0);

for (int i = 0; i < 65; i++) {
arr.add(i);
}

BPlusTree<Int> tree(Allocator::get_default());
tree.init_from_ref(arr.get_ref());
tree.split_if_needed();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't do a split because there are less than 1000 elements, can you add more tests like this checking various sizes: 999, 1000, 1001, 1000*1000 + 1 etc

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added more sizes - but the test will still only be effective when node size is 4.


tree.add(100);
tree.verify();

tree.destroy();
}

#endif // TEST_BPLUS_TREE
Loading