
Commit

Initial commit.
LTLA committed May 19, 2023
0 parents commit 64a98b8
Showing 4 changed files with 228 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
build/
*.swp
30 changes: 30 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,30 @@
cmake_minimum_required(VERSION 3.14)

project(testing
    VERSION 1.0.0
    DESCRIPTION "Performance testing for sparse priority queues"
    LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)

add_executable(testing test.cpp)

include(FetchContent)

FetchContent_Declare(
    tatami
    GIT_REPOSITORY https://github.com/LTLA/tatami
    GIT_TAG 0b3dfba5b040a0a93b818f955ed4b2a9a10e672d
)

FetchContent_MakeAvailable(tatami)

FetchContent_Declare(
    cli11
    GIT_REPOSITORY https://github.com/CLIUtils/CLI11
    GIT_TAG 291c587
)

FetchContent_MakeAvailable(cli11)

target_link_libraries(testing PRIVATE tatami CLI11::CLI11)
64 changes: 64 additions & 0 deletions README.md
@@ -0,0 +1,64 @@
# Performance testing for sparse priority queues

## Motivation

See [tatami-inc/tatami#49](https://github.com/tatami-inc/tatami/issues/49) for the initial suggestion.

The idea here is that, when iterating over the secondary dimension of a sparse matrix,
we use a priority queue to keep track of the current secondary index for each primary dimension element.
Then, when the next secondary index is requested, we only have to inspect the primary elements whose current secondary indices fall at or below the requested one.
This should reduce the search time compared to a linear scan over all of the primary dimension elements... in theory.

## Results

In practice, the performance is disappointing:

```
$ ./build/testing
Testing a 10000 x 100000 matrix with a density of 0.1
Linear access time: 5544 for 99997920 non-zero elements
Priority queue access time: 8713 for 99997920 non-zero elements
```

With hindsight, this is perhaps unsurprising.
Over a full pass, the heap takes `O(Nz * log(N))` time to depopulate and refill, given `N` columns and `Nz` non-zero elements.
The linear search takes only `O(N)` time per row request, so it becomes faster than the heap once the average number of non-zeros per row is a decent fraction of `N / log(N)`.
Even at 5% density, we still observe a mild performance degradation:

```
$ ./build/testing -d 0.05
Testing a 10000 x 100000 matrix with a density of 0.05
Linear access time: 4098 for 49999099 non-zero elements
Priority queue access time: 4598 for 49999099 non-zero elements
```

Eventually, at very low densities, we see the desired improvement:

```
$ ./build/testing -d 0.01
Testing a 10000 x 100000 matrix with a density of 0.01
Linear access time: 2245 for 9997846 non-zero elements
Priority queue access time: 930 for 9997846 non-zero elements
```

## Discussion

It turns out that linear iteration is pretty fast when the only action for most elements is to hit `continue`.
Indeed, **tatami** stores its `current_indices` in a vector that can be iterated contiguously in memory for optimal access, unlike the heap, where elements are scattered around by repeated push/pop cycles.
I suspect that the linear search is also very friendly to the branch predictor for very sparse rows where most columns can be skipped.

Note that this demonstration is already a best-case scenario for the heap.
If consecutive requests do not increment strictly, we might need to sort the indices of the non-zero elements at any given row, as they won't come out of the heap in order.
Moreover, if there is a large jump between the previous and current requested row, the heap's contents are useless and we collapse back to `O(N)` time anyway (with extra overhead to rebuild the heap).

I suppose we _could_ switch between the linear and heap methods depending on the sparsity of the matrix, but that seems pretty complicated.
Given the absence of an unqualified performance benefit, I will prefer the simplicity of the linear search.

## Build instructions

Just use the usual CMake process:

```sh
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
132 changes: 132 additions & 0 deletions test.cpp
@@ -0,0 +1,132 @@
#include "CLI/App.hpp"
#include "CLI/Formatter.hpp"
#include "CLI/Config.hpp"

#include "tatami/tatami.hpp"

#include <chrono>
#include <iostream>
#include <memory>
#include <queue>
#include <random>
#include <vector>

int main(int argc, char* argv[]) {
    CLI::App app{"Sparse priority queue testing"};
    double density;
    app.add_option("-d,--density", density, "Density of the sparse matrix")->default_val(0.1);
    int nr;
    app.add_option("-r,--nrow", nr, "Number of rows")->default_val(10000);
    int nc;
    app.add_option("-c,--ncol", nc, "Number of columns")->default_val(100000);
    CLI11_PARSE(app, argc, argv);

    std::cout << "Testing a " << nr << " x " << nc << " matrix with a density of " << density << std::endl;

    // Simulating a sparse matrix, albeit not very efficiently, but whatever.
    std::vector<int> i, j;
    std::vector<double> x;

    std::mt19937_64 generator(1234567);
    std::uniform_real_distribution<double> distu;
    std::normal_distribution<double> distn;

    for (int c = 0; c < nc; ++c) {
        for (int r = 0; r < nr; ++r) {
            if (distu(generator) <= density) {
                i.push_back(r);
                j.push_back(c);
                x.push_back(distn(generator));
            }
        }
    }

    auto indptrs = tatami::compress_sparse_triplets<false>(nr, nc, x, i, j);
    tatami::ArrayView<double> x_view(x.data(), x.size());
    tatami::ArrayView<int> i_view(i.data(), i.size());
    tatami::ArrayView<size_t> p_view(indptrs.data(), indptrs.size());
    std::shared_ptr<tatami::NumericMatrix> mat(new tatami::CompressedSparseColumnMatrix<double, int, decltype(x_view), decltype(i_view), decltype(p_view)>(nr, nc, x_view, i_view, p_view));

    // Doing the naive linear iteration that's currently in tatami.
    std::vector<double> xbuffer(mat->ncol());
    std::vector<int> ibuffer(mat->ncol());

    auto start = std::chrono::high_resolution_clock::now();
    auto wrk = mat->sparse_row();
    int sum = 0;
    for (int r = 0; r < mat->nrow(); ++r) {
        auto range = wrk->fetch(r, xbuffer.data(), ibuffer.data());
        sum += range.number;
    }
    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
    std::cout << "Linear access time: " << duration.count() << " for " << sum << " non-zero elements" << std::endl;

    // Comparing to a priority queue.
    start = std::chrono::high_resolution_clock::now();

    typedef std::pair<int, int> HeapElement;
    std::priority_queue<HeapElement, std::vector<HeapElement>, std::greater<HeapElement>> next_heap;
    std::vector<int> hits, tmp_hits;
    auto state = indptrs;

    hits.resize(mat->ncol());
    for (int c = 0; c < mat->ncol(); ++c) { // force everything to be re-searched on initialization.
        hits[c] = c;
        --state[c];
    }

    sum = 0;
    for (int r = 0; r < mat->nrow(); ++r) {
        tmp_hits.swap(hits);
        hits.clear();

        for (auto c : tmp_hits) { // 'c' is a column index; don't shadow the data vector 'x'.
            ++state[c];
            if (state[c] < indptrs[c + 1]) {
                int current = i[state[c]];
                if (current == r) {
                    hits.push_back(c);
                } else {
                    next_heap.emplace(current, c);
                }
            }
        }

        while (!next_heap.empty()) {
            auto current_secondary = next_heap.top().first;
            if (current_secondary > r) {
                break;
            }
            auto current_primary_index = next_heap.top().second;
            next_heap.pop();

            if (current_secondary < r) {
                ++state[current_primary_index];
                if (state[current_primary_index] < indptrs[current_primary_index + 1]) {
                    int next_secondary = i[state[current_primary_index]];
                    if (next_secondary == r) {
                        hits.push_back(current_primary_index);
                    } else {
                        next_heap.emplace(next_secondary, current_primary_index);
                    }
                }
            } else {
                hits.push_back(current_primary_index);
            }
        }

        // Copy indices over for a fair comparison.
        for (size_t h = 0; h < hits.size(); ++h) {
            ibuffer[h] = hits[h];
            xbuffer[h] = x[state[hits[h]]]; // index 'state' by column, not by position in 'hits'.
        }

        sum += hits.size();
    }

    stop = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
    std::cout << "Priority queue access time: " << duration.count() << " for " << sum << " non-zero elements" << std::endl;

    return 0;
}
