Skip to content

Commit

Permalink
Began streamlining the oracle-aware caching.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Apr 7, 2024
1 parent e460e6d commit 0534572
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 199 deletions.
210 changes: 85 additions & 125 deletions include/tatami_chunked/OracleSlabCache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,44 +27,34 @@ namespace tatami_chunked {
*/
template<typename Id_, typename Index_, class Slab_>
class OracleSlabCache {
private:
std::shared_ptr<const tatami::Oracle<Index_> > oracle;
size_t max_predictions;
size_t max_slabs;
size_t total;
size_t counter = 0;

private:
std::list<Slab_> slab_cache, tmp_cache, free_cache;

typedef typename std::list<Slab_>::iterator cache_iterator;
std::unordered_map<Id_, std::pair<Index_, cache_iterator> > slab_exists, past_exists;

std::vector<std::pair<Index_, Index_> > predictions_made;
size_t predictions_fulfilled = 0;

std::vector<Slab_*> slab_pointers;
Index_ last_slab_id = 0;
Slab_* last_slab = NULL;

std::vector<std::pair<Index_, cache_iterator*> > unassigned_slabs;
std::vector<std::pair<Id_, Index_> > slabs_to_populate;
size_t max_slabs;
std::vector<Slab_> all_slabs;
std::unordered_map<Id_, Slab_*> current_cache, future_cache;
std::vector<std::pair<Id_, Slab_*> > to_populate;
std::vector<Id_> to_reassign;
size_t refresh_point = 0;

public:
/**
* @param ora Pointer to an `tatami::Oracle` to be used for predictions.
* @param per_iteration Maximum number of predictions to make per iteration.
* @param num_slabs Maximum number of slabs to store.
*/
OracleSlabCache(std::shared_ptr<const tatami::Oracle<Index_> > ora, [[maybe_unused]] size_t per_iteration, size_t num_slabs) :
OracleSlabCache(std::shared_ptr<const tatami::Oracle<Index_> > ora, size_t num_slabs) :
oracle(std::move(ora)),
max_predictions(oracle->total()),
max_slabs(num_slabs)
total(oracle->total()),
max_slabs(num_slabs)
{
slab_exists.reserve(max_slabs);
past_exists.reserve(max_slabs);

predictions_made.reserve(max_predictions);

slab_pointers.reserve(max_slabs);
unassigned_slabs.reserve(max_slabs);
slabs_to_populate.reserve(max_slabs);
all_slabs.reserve(max_slabs);
current_cache.reserve(max_slabs);
future_cache.reserve(max_slabs);
}

/**
Expand All @@ -76,12 +66,6 @@ class OracleSlabCache {
* @endcond
*/

private:
std::pair<const Slab_*, Index_> fetch(size_t i) const {
const auto& current = predictions_made[i];
return std::pair<const Slab_*, Index_>(slab_pointers[current.first], current.second);
}

public:
/**
* This method is intended to be called when `num_slabs = 0`, to provide callers with the oracle predictions for non-cached extraction of data.
Expand All @@ -108,116 +92,92 @@ class OracleSlabCache {
* For example, if each chunk takes up 10 rows, attempting to access row 21 would require retrieval of slab 2 and an offset of 1.
* @param create Function that accepts no arguments and returns a `Slab_` object with sufficient memory to hold a slab's contents when used in `populate()`.
* This may also return a default-constructed `Slab_` object if the allocation is done dynamically per slab in `populate()`.
* @param populate Function that accepts two arguments, `slabs_in_need` and `slab_data`.
* (1) `slabs_in_need` is a `const std::vector<std::pair<Id_, Index_> >&` specifying the slabs to be populated.
* @param populate Function that accepts a `std::vector<std::pair<Id_, Slab_*> >&` specifying the slabs to be populated.
* The first `Id_` element of each pair contains the slab identifier, i.e., the first element returned by the `identify` function.
* The second `Index_` element specifies the index in `slab_data` in which to store the contents of each slab.
* (2) `slab_data` is a `std::vector<Slab_*>&` containing pointers to the cached slab contents to be populated.
* This function should iterate over the `slabs_in_need` and populate the corresponding entries in `slab_data`.
* The second `Slab_*` element contains a pointer to a `Slab_` returned by `create()`.
* This function should iterate over the vector and populate each slab.
* Note that the vector is not guaranteed to be sorted.
*
* @return Pair containing (1) a pointer to a slab's contents and (2) the index of the next predicted row/column inside the retrieved slab.
*/
template<class Ifunction_, class Cfunction_, class Pfunction_>
std::pair<const Slab_*, Index_> next(Ifunction_ identify, Cfunction_ create, Pfunction_ populate) {
if (predictions_made.size() > predictions_fulfilled) {
return fetch(predictions_fulfilled++);
Index_ index = this->next();
auto slab_info = identify(index);
if (slab_info.first == last_slab_id && last_slab) {
return std::make_pair(last_slab, slab_info.second);
}

predictions_made.clear();
size_t used = 0;

// Iterators in the unordered_map should remain valid after swapping the containers,
// see https://stackoverflow.com/questions/4124989/does-stdvectorswap-invalidate-iterators
tmp_cache.swap(slab_cache);

past_exists.swap(slab_exists);
slab_exists.clear();

slab_pointers.clear();
unassigned_slabs.clear();
slabs_to_populate.clear();

while (counter < max_predictions) {
Index_ current = this->next();

auto slab_id = identify(current);
auto curslab = slab_id.first;
auto curindex = slab_id.second;

auto it = slab_exists.find(curslab);
if (it != slab_exists.end()) {
predictions_made.emplace_back((it->second).first, curindex);

} else if (used < max_slabs) {
auto past = past_exists.find(curslab);
if (past != past_exists.end()) {
auto sIt = (past->second).second;
slab_cache.splice(slab_cache.end(), tmp_cache, sIt);
slab_pointers.push_back(&(*sIt));
slab_exists[curslab] = std::make_pair(used, sIt);

} else {
if (free_cache.empty()) {
// We might be able to recycle an existing slab from tmp_cache
// to populate 'curslab'... but we don't know if we can do so at
// this moment, as those slabs might be needed by later predictions.
// So we just defer the creation of a new slab until we've run
// through the set of predictions for this round.
auto ins = slab_exists.insert(std::make_pair(curslab, std::make_pair(used, slab_cache.end())));
unassigned_slabs.emplace_back(used, &(ins.first->second.second));
slab_pointers.push_back(NULL);

} else {
auto sIt = free_cache.begin();
slab_cache.splice(slab_cache.end(), free_cache, sIt);
slab_pointers.push_back(&(*sIt));
slab_exists[curslab] = std::make_pair(used, sIt);
last_slab_id = slab_info.first;

// Updating the cache if we hit the refresh point.
if (counter - 1 == refresh_point) {
requisition_slab(slab_info.first, create);
size_t used_slabs = 1;
auto last_future_slab_id = slab_info.first;

while (++refresh_point < total) {
auto future_index = oracle->get(refresh_point);
auto future_slab_info = identify(future_index);
if (last_future_slab_id != future_slab_info.first) {
if (future_cache.find(future_slab_info.first) == future_cache.end()) {
if (used_slabs == max_slabs) {
break;
}
requisition_slab(future_slab_info.first, create);
++used_slabs;
}

slabs_to_populate.emplace_back(curslab, used);
}

predictions_made.emplace_back(used, curindex);
++used;

} else {
--counter;
break;
}
}

while (!unassigned_slabs.empty()) {
cache_iterator it;
if (!tmp_cache.empty()) {
it = tmp_cache.begin();
slab_cache.splice(slab_cache.end(), tmp_cache, it);
} else {
slab_cache.emplace_back(create());
it = slab_cache.end();
--it;
auto cIt = current_cache.begin();
for (auto a : to_reassign) {
to_populate.emplace_back(a, cIt->second);
future_cache[a] = cIt->second;
++cIt;
}

auto& last = unassigned_slabs.back();
slab_pointers[last.first] = &(*it);

// This changes the value in the slab_exists map without having to do a look-up, see:
// https://stackoverflow.com/questions/16781886/can-we-store-unordered-maptiterator
*(last.second) = it;

unassigned_slabs.pop_back();
// We always fill future_cache to the brim so every entry of
// all_slabs should be referenced by a pointer in future_cache.
// There shouldn't be any free cache entries remaining in
// current_cache i.e., at this point, cIt should equal
// current_cache.end(), as we transferred everything to
// future_cache. Thus it is safe to clear current_cache without
// worrying about leaking memory. The only exception is if we're at
// the end of the predictions, in which case it doesn't matter.
current_cache.clear();
to_reassign.clear();

populate(to_populate);

current_cache.swap(future_cache);
to_populate.clear();
}

while (!tmp_cache.empty()) {
free_cache.splice(free_cache.end(), tmp_cache, tmp_cache.begin());
}
// We know it must exist, so no need to check ccIt's validity.
auto ccIt = current_cache.find(slab_info.first);
last_slab = ccIt->second;
return std::make_pair(last_slab, slab_info.second);
}

if (!slabs_to_populate.empty()) {
populate(slabs_to_populate, slab_pointers);
private:
template<class Cfunction_>
void requisition_slab(Id_ slab_id, Cfunction_ create) {
auto ccIt = current_cache.find(slab_id);
if (ccIt != current_cache.end()) {
auto slab_ptr = ccIt->second;
future_cache[slab_id] = slab_ptr;
current_cache.erase(ccIt);

} else if (all_slabs.size() < max_slabs) {
all_slabs.emplace_back(create());
auto slab_ptr = &(all_slabs.back());
future_cache[slab_id] = slab_ptr;
to_populate.emplace_back(slab_id, slab_ptr);

} else {
future_cache[slab_id] = NULL;
to_reassign.push_back(slab_id);
}

// Well, because we just used one.
predictions_fulfilled = 1;
return fetch(0);
}
};

Expand Down
4 changes: 2 additions & 2 deletions include/tatami_chunked/custom_chunk_coordinator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -606,9 +606,9 @@ class ChunkCoordinator {
/* create = */ [&]() -> Slab {
return Slab(alloc);
},
/* populate =*/ [&](const std::vector<std::pair<Index_, Index_> >& in_need, auto& data) -> void {
/* populate =*/ [&](const std::vector<std::pair<Index_, Slab_*> >& to_populate) -> void {
for (const auto& p : in_need) {
fetch_block(p.first, 0, get_primary_chunkdim<accrow_>(p.first), *(data[p.second]));
fetch_block(p.first, 0, get_primary_chunkdim<accrow_>(p.first), *(p.second));
}
}
);
Expand Down
2 changes: 1 addition & 1 deletion include/tatami_chunked/typical_slab_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ struct TypicalSlabCacheWorkspace {
if constexpr(!oracle_) {
cache = LruSlabCache<Index_, Slab_>(num_slabs_in_cache);
} else if constexpr(!subset_) {
cache = OracleSlabCache<Index_, Index_, Slab_>(std::move(oracle), 10000, num_slabs_in_cache);
cache = OracleSlabCache<Index_, Index_, Slab_>(std::move(oracle), num_slabs_in_cache);
} else {
cache = SubsettedOracleSlabCache<Index_, Index_, Slab_>(std::move(oracle), 10000, num_slabs_in_cache);
}
Expand Down
6 changes: 3 additions & 3 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ add_executable(
libtest
src/LruSlabCache.cpp
src/OracleSlabCache.cpp
src/SubsettedOracleSlabCache.cpp
# src/SubsettedOracleSlabCache.cpp
src/mock_dense_chunk.cpp
src/mock_sparse_chunk.cpp
src/CustomDenseChunkedMatrix.cpp
src/CustomSparseChunkedMatrix.cpp
# src/CustomDenseChunkedMatrix.cpp
# src/CustomSparseChunkedMatrix.cpp
)

set(CODE_COVERAGE OFF CACHE BOOL "Enable coverage testing")
Expand Down
Loading

0 comments on commit 0534572

Please sign in to comment.