Skip to content

Commit

Permalink
Greatly improve performance of sorting dictionaries
Browse files Browse the repository at this point in the history
Dictionary lookup by index is slow, so read all of the keys/values from the
dictionary into a vector and sort that instead. Using a dictionary iterator to
read the values replaces O(N log N) ClusterTree lookups with O(log N), so this
is asymptotically faster. The benchmark is sped up from 3.77 seconds to 21.62
milliseconds, but more reasonably sized Dictionaries will see proportionally
smaller benefits.

The downside of this is that the array of Mixed requires 24 bytes of scratch
space per element in the dictionary. We already require 8 bytes per element to
store the results, so this is just a constant factor increase rather than an
aysmtotic change in memory use and it's probably not a problem.
  • Loading branch information
tgoyne committed Jan 15, 2022
1 parent 733f127 commit b68dff4
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 35 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

### Enhancements
* <New feature description> (PR [#????](https://github.com/realm/realm-core/pull/????))
* None.
* Greatly improve the performance of sorting or distincting a Dictionary's keys or values. The most expensive operation is now performed O(log N) rather than O(N log N) times, and large Dictionaries can see upwards of 99% reduction in time to sort. ([PR #5166](https://github.com/realm/realm-core/pulls/5166))

### Fixed
* <How do the end-user experience this issue? what was the impact?> ([#????](https://github.com/realm/realm-core/issues/????), since v?.?.?)
Expand Down
74 changes: 47 additions & 27 deletions src/realm/dictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,27 +451,49 @@ void Dictionary::align_indices(std::vector<size_t>& indices) const
}
}

void Dictionary::sort(std::vector<size_t>& indices, bool ascending) const
namespace {
std::vector<Mixed> get_keys(const Dictionary& dict)
{
std::vector<Mixed> values;
values.reserve(dict.size());
for (auto it = dict.begin(), end = dict.end(); it != end; ++it)
values.push_back(it.key());
return values;
}

std::vector<Mixed> get_values(const Dictionary& dict)
{
std::vector<Mixed> values;
values.reserve(dict.size());
for (auto it = dict.begin(), end = dict.end(); it != end; ++it)
values.push_back(it.value());
return values;
}

void do_sort(std::vector<size_t>& indices, bool ascending, const std::vector<Mixed>& values)
{
align_indices(indices);
auto b = indices.begin();
auto e = indices.end();
std::sort(b, e, [this, ascending](size_t i1, size_t i2) {
auto v1 = get_any(i1);
auto v2 = get_any(i2);
return ascending ? v1 < v2 : v2 < v1;
std::sort(b, e, [ascending, &values](size_t i1, size_t i2) {
return ascending ? values[i1] < values[i2] : values[i2] < values[i1];
});
}
} // anonymous namespace

void Dictionary::distinct(std::vector<size_t>& indices, util::Optional<bool> ascending) const
void Dictionary::sort(std::vector<size_t>& indices, bool ascending) const
{
align_indices(indices);
do_sort(indices, ascending, get_values(*this));
}

bool sort_ascending = ascending ? *ascending : true;
sort(indices, sort_ascending);
void Dictionary::distinct(std::vector<size_t>& indices, util::Optional<bool> ascending) const
{
align_indices(indices);
auto values = get_values(*this);
do_sort(indices, ascending.value_or(true), values);
indices.erase(std::unique(indices.begin(), indices.end(),
[this](size_t i1, size_t i2) {
return get_any(i1) == get_any(i2);
[&values](size_t i1, size_t i2) {
return values[i1] == values[i2];
}),
indices.end());

Expand All @@ -484,13 +506,7 @@ void Dictionary::distinct(std::vector<size_t>& indices, util::Optional<bool> asc
void Dictionary::sort_keys(std::vector<size_t>& indices, bool ascending) const
{
align_indices(indices);
auto b = indices.begin();
auto e = indices.end();
std::sort(b, e, [this, ascending](size_t i1, size_t i2) {
auto k1 = get_key(i1);
auto k2 = get_key(i2);
return ascending ? k1 < k2 : k2 < k1;
});
do_sort(indices, ascending, get_keys(*this));
}

void Dictionary::distinct_keys(std::vector<size_t>& indices, util::Optional<bool>) const
Expand Down Expand Up @@ -1065,34 +1081,38 @@ Dictionary::Iterator::Iterator(const Dictionary* dict, size_t pos)
{
}

auto Dictionary::Iterator::operator*() const -> value_type
Mixed Dictionary::Iterator::key() const
{
update();
Mixed key;
switch (m_key_type) {
case type_String: {
ArrayString keys(m_tree.get_alloc());
ref_type ref = to_ref(Array::get(m_leaf.get_mem().get_addr(), 1));
keys.init_from_ref(ref);
key = Mixed(keys.get(m_state.m_current_index));
break;
return Mixed(keys.get(m_state.m_current_index));
}
case type_Int: {
ArrayInteger keys(m_tree.get_alloc());
ref_type ref = to_ref(Array::get(m_leaf.get_mem().get_addr(), 1));
keys.init_from_ref(ref);
key = Mixed(keys.get(m_state.m_current_index));
break;
return Mixed(keys.get(m_state.m_current_index));
}
default:
throw std::runtime_error("Not implemented");
break;
}
}

Mixed Dictionary::Iterator::value() const
{
ArrayMixed values(m_tree.get_alloc());
ref_type ref = to_ref(Array::get(m_leaf.get_mem().get_addr(), 2));
values.init_from_ref(ref);
return values.get(m_state.m_current_index);
}

return std::make_pair(key, values.get(m_state.m_current_index));
auto Dictionary::Iterator::operator*() const -> value_type
{
update();
return std::make_pair(key(), value());
}


Expand Down
3 changes: 3 additions & 0 deletions src/realm/dictionary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,9 @@ class Dictionary::Iterator : public ClusterTree::Iterator {
return ret;
}

Mixed key() const;
Mixed value() const;

private:
friend class Dictionary;
using ClusterTree::Iterator::get_position;
Expand Down
11 changes: 4 additions & 7 deletions test/benchmark-common-tasks/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,7 @@ struct BenchmarkSortInt : BenchmarkWithInts {
struct BenchmarkSortIntList : Benchmark {
const char* name() const
{
return "BenchmarkSortIntList";
return "SortIntList";
}

void before_all(DBRef group)
Expand Down Expand Up @@ -1267,12 +1267,9 @@ struct BenchmarkSortIntList : Benchmark {
};

struct BenchmarkSortIntDictionary : Benchmark {
// Dictionary sorting is sufficiently slow that we want to use a smaller size
static const size_t size = BASE_SIZE / 10;

const char* name() const
{
return "BenchmarkSortIntDictionary";
return "SortIntDictionary";
}

void before_all(DBRef group)
Expand All @@ -1285,11 +1282,11 @@ struct BenchmarkSortIntDictionary : Benchmark {

Dictionary dict = obj.get_dictionary(m_col);
Random r;
for (size_t i = 0; i < size; ++i) {
for (size_t i = 0; i < BASE_SIZE; ++i) {
dict.insert(util::to_string(i), r.draw_int<int64_t>());
}
tr.commit();
m_indices.reserve(size);
m_indices.reserve(BASE_SIZE);
}

void after_all(DBRef db)
Expand Down

0 comments on commit b68dff4

Please sign in to comment.