-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 1e799b1
Showing
7 changed files
with
294 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# CI workflow: installs Mojo 24.2.0 plus a conda Python, then runs main.🔥,
# which executes the rbloom baseline, the Mojo self-test and the Mojo benchmark.
name: Test and Run

on:
  pull_request:
  push:
  workflow_dispatch:

jobs:
  test-examples:
    runs-on: ubuntu-latest
    defaults:
      run:
        # setup-miniconda only activates its environment in login shells.
        # With plain `bash` the env is not activated, so `pip` and
        # `$CONDA_PREFIX` below would not refer to the conda Python.
        shell: bash -el {0}
    env:
      DEBIAN_FRONTEND: noninteractive
      LLVM_VERSION: 17

    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      - name: "Setup conda env (base)"
        uses: conda-incubator/setup-miniconda@v2
        with:
          # Quoted so YAML never reinterprets the version as a float.
          python-version: "3.11"
          auto-activate-base: true

      - name: "Install mojo"
        env:
          # Pass the secret through the environment instead of interpolating
          # it into the script text.
          MODULAR_AUTH: ${{ secrets.MODULAR_AUTH }}
        run: |
          curl https://get.modular.com | sh - && \
          modular auth "$MODULAR_AUTH" && \
          modular install --install-version 24.2.0 mojo
      - name: "Setup conda env"
        uses: conda-incubator/setup-miniconda@v2
        with:
          python-version: "3.11"
          activate-environment: base

      - name: "Install pip deps"
        run: pip install rbloom

      - name: "Run main"
        run: |
          export MODULAR_HOME="/home/runner/.modular"
          export PATH="/home/runner/.modular/pkg/packages.modular.com_mojo/bin:$PATH"
          # Point Mojo's Python interop at the conda environment's libpython.
          export MOJO_PYTHON_LIBRARY="$(find "$CONDA_PREFIX/lib" -iname 'libpython*.[s,d]*' | sort -r | head -n 1)"
          mojo run main.🔥
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2024 Alan Derk | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Fireweed | ||
A toy project for trying out Mojo🔥 by implementing a Bloom filter. Why a Bloom filter? | ||
* Limited API | ||
* Super fun w/ probability and bit operations | ||
* Seemingly does the impossible: it tests set membership while storing only a few bits per item | ||
* Useful | ||
* Extendable to Cuckoo, kMinHash, HyperLogLog(++), Counting Bloom Filters, etc. | ||
|
||
This project borrows heavily from https://github.com/KenanHanke/rbloom, which is much more production-worthy and better commented/robust/tested. | ||
|
||
# Installation And Running | ||
1. Install Mojo | ||
2. Install python (probably via miniconda) | ||
3. Set MOJO_PYTHON_LIBRARY | ||
4. pip install rbloom | ||
5. mojo run main.🔥 | ||
|
||
# Benchmark Results | ||
Benchmarks always have some unfairness, so take these with a grain of salt. This compares Python calling a Rust library (rbloom) against Mojo calling a Mojo library, which is very much apples-to-oranges. | ||
|
||
``` | ||
alan@mbalan ~/C/fireweed (main) [1]> mojo run main.🔥 (py310) | ||
===rbloom/python results=== | ||
bf=<Bloom size_in_bits=95850584 approx_items=0.0> | ||
add_time=0.9502909183502197 | ||
total_time=1.6721301078796387 | ||
num_hash_funcs= 6 | ||
filter_size_total_bits= 95850624 | ||
===mojo results=== | ||
--------------------- | ||
Benchmark Report (s) | ||
--------------------- | ||
Mean: 0.61287924999999999 | ||
Total: 2.4515169999999999 | ||
Iters: 4 | ||
Warmup Mean: 0.61581649999999999 | ||
Warmup Total: 1.231633 | ||
Warmup Iters: 2 | ||
Fastest Mean: 0.61287924999999999 | ||
Slowest Mean: 0.61287924999999999 | ||
``` | ||
|
||
# Future work | ||
* Investigate TODOs/HACKs/MUSINGs | ||
* Currently it scales very poorly above ~100k elements on an M2 MBA; the cause is not yet understood. | ||
* Performance profiling | ||
* Write a real test suite | ||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import math | ||
from collections.vector import InlinedFixedVector | ||
|
||
struct Bloom[filter_size_64bits: Int, num_hashes: Int]():
    """Implements a bloom filter with addition and probabilistic containment.

    The filter uses a fixed amount of space. A containment query never gives a
    false negative (an added element is always reported as present) but may
    give false positives.

    Parameters:
        filter_size_64bits: Internal state size in 64bit increments.
        num_hashes: Number of bit indices in the filter to fill per addition.
    """

    # TODO: aliases cannot access aliases
    # TODO: Add aliases/params for chunk_type, simdwidth, chunk_size
    var _state: DTypePointer[DType.uint64]

    fn __init__(inout self):
        """Allocates the bit array and clears every bit."""
        # A zero-sized filter would make the index computation take a modulo
        # by zero, so reject it at compile time.
        constrained[filter_size_64bits > 0, "filter_size_64bits must be positive"]()
        # Allocate through the pointer type rather than through the
        # not-yet-initialized `self._state` field.
        self._state = DTypePointer[DType.uint64].alloc(filter_size_64bits)
        memset_zero(self._state, filter_size_64bits)

    fn __del__(owned self):
        """Frees the bit array allocated in `__init__`."""
        self._state.free()

    @always_inline
    fn add[T: Hashable](inout self, el: T):
        """Adds an element to the bloom filter.

        Args:
            el: The element to insert; only its hash is used.
        """
        var idxs = self._generate_indicies(el)
        @unroll(4)
        for i in range(num_hashes):
            var bit_loc = idxs[i]
            # Split the global bit index into (64-bit word, bit within word).
            var filter_chunk = bit_loc // 64
            var loc_within_chunk = bit_loc % 64
            # Build the mask as UInt64, the same way `__contains__` does.
            self._state[filter_chunk.to_int()] |= (UInt64(1) << loc_within_chunk)

    @always_inline
    fn __contains__[T: Hashable](self, el: T) -> Bool:
        """Returns containment status w/ false positives.

        Args:
            el: The element to look up; only its hash is used.

        Returns:
            False if any of the element's bits is unset, True otherwise.
        """
        var idxs = self._generate_indicies(el)
        @unroll(4)
        for i in range(num_hashes):
            var bit_loc = idxs[i]
            var filter_chunk = bit_loc // 64
            var loc_within_chunk = bit_loc % 64
            if (self._state[filter_chunk.to_int()] & (UInt64(1) << loc_within_chunk)) == UInt64(0):
                return False
        return True

    @always_inline
    fn _generate_indicies[T: Hashable](self, el: T) -> InlinedFixedVector[UInt64, num_hashes]:
        # Generates num_hashes indices in [0, 64*filter_size_64bits).
        # The element's hash seeds a linear congruential generator; each step
        # of the generator yields one index.
        # Linear Congruential Generator params taken from https://nuclear.llnl.gov/CNP/rng/rngman/node4.html
        var a = 2862933555777941757
        var b = 3037000493
        var ret = InlinedFixedVector[UInt64, num_hashes](num_hashes)
        var entropy: UInt64 = hash(el)
        @unroll(4)
        for _ in range(num_hashes):
            # `append` (not `ret[i] = ...`) so the vector's logical size grows
            # with each stored index and later `idxs[i]` reads are in bounds.
            ret.append(entropy % (filter_size_64bits * 64))
            entropy = a * entropy + b
        return ret
|
||
|
||
# TODO: Move these fn's to a class method `from_expectation`. It proved | ||
# difficult to have an unbound return type on a class method. | ||
@parameter
fn filter_size(expected_items: Int, false_pos_rate: Float64) -> Int:
    """Computes the filter length, in 64-bit words, for a target capacity.

    The raw bit count is -n * ln(p) / ln(2)^2 truncated to an integer; it is
    then integer-divided by 64 and one extra word is added.

    Args:
        expected_items: Number of elements the filter is sized for (n).
        false_pos_rate: Desired false positive probability (p).

    Returns:
        The number of 64-bit words (not bits) the filter should hold.
    """
    # TODO: Add citation for formula
    # TODO: Why is math.log() so hard to resolve types for!?!
    var ln2 = math.log(Float64(2.0))
    var scaled_log = expected_items * math.log(false_pos_rate) / ln2**2
    var bit_count = -1 * scaled_log.to_int()
    return bit_count // 64 + 1
|
||
@parameter
fn num_hash_funcs(expected_items: Int, false_pos_rate: Float64) -> Int:
    """Computes how many bits to set per element, commonly called `k`.

    Uses k = (m / n) * ln(2), truncated to an integer, where m is the total
    bit count derived from `filter_size` and n is `expected_items`.

    Args:
        expected_items: Number of elements the filter is sized for (n).
        false_pos_rate: Desired false positive probability.

    Returns:
        The number of hash functions to use, never less than 1.
    """
    # TODO: Add citation for formula.
    var filter_size_64bits = filter_size(expected_items, false_pos_rate)
    var log2 = math.log(Float64(2.0))
    var ret = ((filter_size_64bits * 64 / expected_items) * log2).to_int()
    # Truncation yields 0 for permissive false positive rates; a filter with
    # zero hash functions sets no bits and reports every element as present.
    if ret < 1:
        return 1
    return ret
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import rbloom | ||
import time | ||
|
||
def run_python_benchmark(size):
    """Time rbloom insertions and lookups for ``size`` float keys.

    Builds an ``rbloom.Bloom`` sized for ``size`` items at a 0.01 false
    positive rate, adds the keys ``i + 0.5`` for ``i`` in ``range(size)``,
    then checks every key is reported as present. Prints the filter, the
    time spent adding, and the total time (adds plus lookups).

    Args:
        size: Number of keys to insert and look up.

    Raises:
        AssertionError: If an inserted key is not found in the filter.
    """
    bf = rbloom.Bloom(size, 0.01)
    print(f"{bf=}")

    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted mid-benchmark.
    start = time.perf_counter()
    for i in range(size):
        bf.add(i + 0.5)
    add_time = time.perf_counter() - start

    for i in range(size):
        # Explicit raise rather than `assert`, so the lookups are still
        # executed (and timed) when Python runs with -O.
        if i + 0.5 not in bf:
            raise AssertionError(f"{i + 0.5} not found in filter after add")

    total_time = time.perf_counter() - start

    print(f"{add_time=}\n{total_time=}\n\n", flush=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from python import Python | ||
import bloom | ||
import benchmark | ||
import time | ||
import testing | ||
|
||
# TODO: Get ELEMENTS to work as a parameter, e.g. `alias ELEMENTS = 10_000_000`. | ||
# Until then the element count 10_000_000 is hard-coded in each function below and must be edited in every place. | ||
|
||
fn main() raises:
    """Runs the rbloom baseline, the Mojo self-test, then the Mojo benchmark."""
    # Order matters only for the printed output: baseline first, Mojo after.
    run_python()
    test_mojo()
    run_mojo()
|
||
|
||
fn run_python() raises:
    """Runs the rbloom benchmark defined in the `bloompy` Python module."""
    print("===rbloom/python results===")
    # Add the current directory to Python's search path before importing
    # the `bloompy` module from it.
    Python.add_to_path(".")
    Python.import_module("bloompy").run_python_benchmark(10_000_000)
|
||
fn run_mojo():
    """Benchmarks `bloom_mojo_single_run` and prints the resulting report."""
    print("===mojo results===")
    benchmark.run[bloom_mojo_single_run]().print()
|
||
fn bloom_mojo_single_run():
    """One benchmark iteration: fill a filter with 10M keys, then query them all."""
    var bloom_filter = bloom.Bloom[
        bloom.filter_size(10_000_000, 0.01),
        bloom.num_hash_funcs(10_000_000, 0.01),
    ]()

    # Keys are `n + 0.5` to match the python impl and avoid special-case
    # int hashing.
    for n in range(10_000_000):
        bloom_filter.add(n + 0.5)

    for n in range(10_000_000):
        var found = (n + 0.5) in bloom_filter
        # Hand the result to the benchmark harness so the lookup counts as used.
        benchmark.keep(found)
|
||
|
||
fn test_mojo() raises:
    """Sanity-checks the Mojo bloom filter.

    Inserts the keys `i + 0.5` for every even `i` below 20M and asserts each
    one is found. Then probes the odd `i` (never inserted) and asserts that
    fewer than 10% of them are reported as present.
    """
    var bloom_filter = bloom.Bloom[
        bloom.filter_size(10_000_000, 0.01),
        bloom.num_hash_funcs(10_000_000, 0.01),
    ]()
    print("num_hash_funcs=", bloom.num_hash_funcs(10_000_000, 0.01))
    print("filter_size_total_bits=", bloom.filter_size(10_000_000, 0.01) * 64)

    # Insert the even-based keys.
    for even in range(0, 10_000_000 * 2, 2):
        bloom_filter.add(even + 0.5)

    # Every inserted key must be reported as present.
    for even in range(0, 10_000_000 * 2, 2):
        testing.assert_true(
            (even + 0.5) in bloom_filter,
            "did not find value when expected:" + str(even + 0.5),
        )

    # Odd-based keys were never inserted, so each hit is a false positive.
    var false_positives = 0
    for odd in range(1, 10_000_000 * 2, 2):
        if (odd + 0.5) in bloom_filter:
            false_positives += 1
    testing.assert_true(
        false_positives / 10_000_000 < 0.10,
        "found too many false positives: " + str(false_positives),
    )
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|