initial

aderk · Apr 4, 2024 · 1e799b1 · 1e799b1
commit 1e799b1
Show file tree

Hide file tree

Showing 7 changed files with 294 additions and 0 deletions.
diff --git a/.github/workflows/test_and_bench.yaml b/.github/workflows/test_and_bench.yaml
@@ -0,0 +1,48 @@
+name: Test and Run
+
+# Installs Mojo plus a conda-provided Python on a GitHub-hosted Ubuntu runner
+# and runs main.🔥 on every pull request, push, and manual dispatch.
+on:
+  pull_request:
+  push:
+  workflow_dispatch:
+
+jobs:
+  test-examples:
+    runs-on: ubuntu-latest
+    # NOTE(review): setup-miniconda's documentation asks for a login shell
+    # (e.g. `bash -el {0}`) on steps that need the conda env activated.
+    # Confirm that $CONDA_PREFIX and `pip` resolve to conda under plain `bash`.
+    defaults:
+      run:
+        shell: bash
+    env:
+      DEBIAN_FRONTEND: noninteractive
+      LLVM_VERSION: 17
+
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      - name: "Setup conda env (base)"
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: 3.11
+          auto-activate-base: true
+
+      - name: "Install mojo"
+        # The secret is handed to the script through the environment instead of
+        # being spliced into the script text before the shell parses it.
+        # NOTE(review): secrets are not passed to pull_request runs from forks,
+        # so this step presumably fails there - confirm that is acceptable.
+        env:
+          MODULAR_AUTH: ${{ secrets.MODULAR_AUTH }}
+        run: |
+          curl https://get.modular.com | sh - && \
+          modular auth "$MODULAR_AUTH" && \
+          modular install --install-version 24.2.0 mojo
+
+      - name: "Setup conda env"
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: 3.11
+          activate-environment: base
+
+      - name: "Install pip deps"
+        run: pip install rbloom
+
+      - name: "Run main"
+        # MOJO_PYTHON_LIBRARY is set to the last libpython*.so / libpython*.dylib
+        # (reverse sort order) found under the conda prefix.
+        run: |
+          export MODULAR_HOME="/home/runner/.modular"
+          export PATH="/home/runner/.modular/pkg/packages.modular.com_mojo/bin:$PATH"
+          export MOJO_PYTHON_LIBRARY="$(find "$CONDA_PREFIX/lib" -iname 'libpython*.[sd]*' | sort -r | head -n 1)"
+          mojo run main.🔥
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Alan Derk
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,50 @@
+# Fireweed
+A toy project for trying out Mojo🔥 by implementing a Bloom filter. Why a Bloom filter?
+ * Limited API
+ * Super fun w/ probability and bit operations
+   * Seemingly does the impossible: it measures containment without storing more than several bits per item
+ * Useful
+ * Extendable to Cuckoo, kMinHash, HyperLogLog(++), Counting Bloom Filters, etc.
+
+This project borrows heavily from https://github.com/KenanHanke/rbloom, which is much more production-worthy and better commented/robust/tested.
+
+# Installation And Running
+1. Install Mojo
+2. Install Python (probably via miniconda)
+3. Set MOJO_PYTHON_LIBRARY to the path of that Python's libpython shared library
+4. pip install rbloom
+5. mojo run main.🔥
+
+# Benchmark Results
+Benchmarks always have some unfairness, so take these with a grain of salt. This compares Python calling a Rust library (rbloom) against Mojo calling a Mojo library, which is very much apples-to-oranges.
+
+```
+alan@mbalan ~/C/fireweed (main) [1]> mojo run main.🔥           (py310) 
+===rbloom/python results===
+bf=<Bloom size_in_bits=95850584 approx_items=0.0>
+add_time=0.9502909183502197
+total_time=1.6721301078796387
+
+
+num_hash_funcs= 6
+filter_size_total_bits= 95850624
+===mojo results===
+---------------------
+Benchmark Report (s)
+---------------------
+Mean: 0.61287924999999999
+Total: 2.4515169999999999
+Iters: 4
+Warmup Mean: 0.61581649999999999
+Warmup Total: 1.231633
+Warmup Iters: 2
+Fastest Mean: 0.61287924999999999
+Slowest Mean: 0.61287924999999999
+```
+
+# Future work
+ * Investigate TODOs/HACKs/MUSINGs
+ * Currently it scales very poorly above ~100k elements on an M2 MacBook Air; the cause has not been identified yet.
+ * Performance profiling
+ * Write a real test suite 
+
diff --git a/__pycache__/bloompy.cpython-311.pyc b/__pycache__/bloompy.cpython-311.pyc
diff --git a/bloom.🔥 b/bloom.🔥
@@ -0,0 +1,88 @@
+import math
+from collections.vector import InlinedFixedVector
+
+struct Bloom[filter_size_64bits: Int, num_hashes: Int]():
+    """Implements a bloom filter with addition and probabilistic containment.
+
+    This provides small space, only true negatives for containment, and some
+    false positives.
+
+    Parameters:
+        filter_size_64bits: Internal state size in 64bit increments.
+        num_hashes: Number of buckets/indices in the filter to fill per addition.
+    """
+
+    # TODO: aliases cannot access aliases
+    # TODO: Add aliases/params for chunk_type, simdwidth, chunk_size
+    # Bit array of the filter: filter_size_64bits chunks of 64 bits each.
+    var _state: DTypePointer[DType.uint64]
+
+    fn __init__(inout self):
+        """Allocates the filter state and clears every bit."""
+        # Call alloc on the type rather than through the field, which has not
+        # been initialised yet at this point.
+        self._state = DTypePointer[DType.uint64].alloc(filter_size_64bits)
+        memset_zero(self._state, filter_size_64bits)
+
+    fn __del__(owned self):
+        """Frees the state buffer allocated in __init__."""
+        self._state.free()
+
+    @always_inline
+    fn add[T: Hashable](inout self, el: T):
+        """Adds an element to the bloom filter."""
+        var idxs = self._generate_indicies(el)
+        @unroll(4)
+        for i in range(num_hashes):
+            # Split the global bit index into (64-bit chunk, bit within chunk).
+            var bit_loc = idxs[i]
+            var filter_chunk = bit_loc // 64
+            var loc_within_chunk = bit_loc % 64
+            # Build the mask as an explicit UInt64, exactly as __contains__ does.
+            self._state[filter_chunk.to_int()] |= (UInt64(1) << loc_within_chunk)
+
+    @always_inline
+    fn __contains__[T: Hashable](self, el: T) -> Bool:
+        """Returns containment status w/ false positives."""
+        var idxs = self._generate_indicies(el)
+        @unroll(4)
+        for i in range(num_hashes):
+            var bit_loc = idxs[i]
+            var filter_chunk = bit_loc // 64
+            var loc_within_chunk = bit_loc % 64
+            # A single unset bit proves the element was never added.
+            if (self._state[filter_chunk.to_int()] & (UInt64(1) << loc_within_chunk)) == UInt64(0):
+                return False
+        return True
+
+    @always_inline
+    fn _generate_indicies[T: Hashable](self, el: T) -> InlinedFixedVector[UInt64, num_hashes]:
+        # Generates num_hashes bit indices in [0, 64*filter_size_64bits).
+        # The first index is derived from hash(el); each later one comes from
+        # advancing a Linear Congruential Generator seeded with that hash.
+        # LCG params taken from https://nuclear.llnl.gov/CNP/rng/rngman/node4.html
+        var a = 2862933555777941757
+        var b = 3037000493
+        var ret = InlinedFixedVector[UInt64, num_hashes](num_hashes)
+        var entropy: UInt64 = hash(el)
+        @unroll(4)
+        for i in range(num_hashes):
+            # MUSING: I think I can use __setitem__ but it won't change the size of ret.
+            ret[i] = (entropy % (filter_size_64bits*64)).to_int()
+            entropy = a * entropy + b
+        return ret
+
+
+# TODO: Move these fn's to a class method `from_expectation`. It proved
+# difficult to have an unbound return type on a class method.
+@parameter
+fn filter_size(expected_items: Int, false_pos_rate: Float64) -> Int:
+    # Calculates how large the filter should be for the expected item count and
+    # target false positive rate. The return value is a count of 64-bit chunks
+    # (not bits), i.e. the value Bloom takes as filter_size_64bits.
+    # Bit count used: -expected_items * ln(false_pos_rate) / ln(2)^2.
+    # TODO: Add citation for formula
+    # TODO: Why is math.log() so hard to resolve types for!?!
+    # Note: `log2` holds ln(2) (math.log of 2.0), not a base-2 logarithm.
+    var log2 = math.log(Float64(2.0))
+    var raw_bit_length = -1 * (expected_items * math.log(false_pos_rate) / log2**2).to_int()
+    # Rounds up to whole 64-bit chunks; `// 64 + 1` always adds one chunk, so an
+    # exact multiple of 64 bits still gets one spare chunk.
+    var ret = raw_bit_length // 64 + 1
+    return ret
+
+@parameter
+fn num_hash_funcs(expected_items: Int, false_pos_rate: Float64) -> Int:
+    # Number of bits to set, each with a different hash func, commonly called `k`.
+    # Computed as (total filter bits / expected_items) * ln(2), then converted
+    # to Int with to_int().
+    # TODO: Add citation for formula.
+    # NOTE(review): an earlier HACK note here mentioned rounding to powers of 2
+    # so a SIMD type could be used; the code below performs no such rounding.
+    var filter_size_64bits = filter_size(expected_items, false_pos_rate)
+    # Note: `log2` holds ln(2) (math.log of 2.0), not a base-2 logarithm.
+    var log2 = math.log(Float64(2.0))
+    var ret = (filter_size_64bits * 64 / expected_items) * log2
+    return ret.to_int()
+
diff --git a/bloompy.py b/bloompy.py
@@ -0,0 +1,17 @@
+import rbloom
+import time
+
+def run_python_benchmark(size):
+    bf = rbloom.Bloom(size, 0.01)
+    print(f"{bf=}")
+    start = time.time()
+    for i in range(size):
+        bf.add(i + 0.5)  
+
+    add_time = time.time() - start
+    for i in range(size):
+        assert i + 0.5 in bf
+
+    total_time = time.time() - start
+
+    print(f"{add_time=}\n{total_time=}\n\n", flush=True)
diff --git a/main.🔥 b/main.🔥
@@ -0,0 +1,70 @@
+from python import Python
+import bloom
+import benchmark
+import time
+import testing
+
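+# TODO: Get ELEMENTS to work as a parameter, alias ELEMENTS = 10_000_000
+# Until then, the literal 10_000_000 must be edited by hand everywhere it appears
+# in run_python, bloom_mojo_single_run, and test_mojo.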
+
+fn main() raises:
+    # Entry point: runs the rbloom (Python) benchmark, then the Mojo
+    # correctness check, then the Mojo benchmark, in that order.
+    run_python()
+    test_mojo()
+    run_mojo()
+
+
+fn run_python() raises:
+    # Runs the rbloom benchmark by importing bloompy through Python interop
+    # and calling its run_python_benchmark with 10_000_000 elements.
+    print("===rbloom/python results===")
+    # Add the current directory to the Python path so `bloompy` can be imported.
+    Python.add_to_path(".")
+    var bloompy = Python.import_module("bloompy")
+    bloompy.run_python_benchmark(10_000_000)
+
+fn run_mojo():
+    # Benchmarks bloom_mojo_single_run with the `benchmark` module and prints
+    # the resulting report.
+    print("===mojo results===")
+    var report = benchmark.run[bloom_mojo_single_run]()
+    report.print()
+
+fn bloom_mojo_single_run():
+    # One benchmark iteration: builds a filter sized for 10_000_000 items at a
+    # 0.01 false positive rate, adds 10_000_000 float keys, then looks each
+    # one up again.
+    var bf = bloom.Bloom[
+        bloom.filter_size(10_000_000, 0.01),
+        bloom.num_hash_funcs(10_000_000, 0.01),
+        ]()
+
+    for i in range(10_000_000):
+        # match python impl to avoid special-case int hashing.
+        bf.add(i + 0.5) 
+
+    for i in range(10_000_000):
+        var _unused = i + 0.5 in bf
+        # Pass the result to benchmark.keep so the lookup is not optimized away.
+        benchmark.keep(_unused)
+
+
+fn test_mojo() raises:
+    # Smoke test for bloom.Bloom: every added key must be found, and the share
+    # of never-added keys reported as present must stay below the threshold.
+    var bf = bloom.Bloom[
+        bloom.filter_size(10_000_000, 0.01),
+        bloom.num_hash_funcs(10_000_000, 0.01), 
+        ]()
+    print("num_hash_funcs=",bloom.num_hash_funcs(10_000_000, 0.01))
+    print("filter_size_total_bits=", bloom.filter_size(10_000_000, 0.01)*64)
+
+
+    # Insert the keys built from even i (0.5, 2.5, 4.5, ...).
+    for i in range(0, 10_000_000 * 2, 2):
+        bf.add(i + 0.5)  
+
+    # No false negatives: every inserted key must be reported as present.
+    for i in range(0, 10_000_000 * 2, 2):
+        testing.assert_true(i + 0.5 in bf, "did not find value when expected:" + str(i+0.5))
+
+    # Keys built from odd i were never inserted, so every hit is a false positive.
+    var trues = 0
+    for i in range(1, 10_000_000 * 2, 2):
+        if i+0.5 in bf:
+            trues += 1
+    # NOTE(review): the filter is sized for a 0.01 false positive rate but this
+    # accepts anything below 0.10 - confirm the looser bound is intentional.
+    testing.assert_true(trues / 10_000_000 < 0.10, "found too many false positives: " + str(trues))
+
+
+
+
+
+
+
+
+