Skip to content

Commit

Permalink
initial
Browse files Browse the repository at this point in the history
  • Loading branch information
DontNeedGithubAccount committed Apr 4, 2024
0 parents commit 1e799b1
Show file tree
Hide file tree
Showing 7 changed files with 294 additions and 0 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/test_and_bench.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# CI workflow: installs Mojo plus the Python dependency, then runs main.🔥.
name: Test and Run

on:
  pull_request:
  push:
  workflow_dispatch:

jobs:
  test-examples:
    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash
    env:
      DEBIAN_FRONTEND: noninteractive
      LLVM_VERSION: 17

    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      - name: "Setup conda env (base)"
        uses: conda-incubator/setup-miniconda@v2
        with:
          python-version: 3.11
          auto-activate-base: true

      - name: "Install mojo"
        # The token is handed to the script through the environment instead of
        # being interpolated into the script text.
        env:
          MODULAR_AUTH: ${{ secrets.MODULAR_AUTH }}
        run: |
          curl https://get.modular.com | sh - && \
          modular auth "$MODULAR_AUTH" && \
          modular install --install-version 24.2.0 mojo

      - name: "Setup conda env"
        uses: conda-incubator/setup-miniconda@v2
        with:
          python-version: 3.11
          activate-environment: base

      - name: "Install pip deps"
        run: pip install rbloom

      - name: "Run main"
        run: |
          export MODULAR_HOME="/home/runner/.modular"
          export PATH="/home/runner/.modular/pkg/packages.modular.com_mojo/bin:$PATH"
          # Take the lexicographically last libpython whose extension starts
          # with "s" or "d" (e.g. .so / .dylib).
          export MOJO_PYTHON_LIBRARY="$(find "$CONDA_PREFIX/lib" -iname 'libpython*.[sd]*' | sort -r | head -n 1)"
          mojo run main.🔥
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Alan Derk

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
50 changes: 50 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Fireweed
A toy project of trying out Mojo🔥 with a Bloom filter. Why a Bloom filter?
* Limited API
* Super fun w/ probability and bit operations
* Seemingly does the impossible: it tests set membership while storing only a few bits per item
* Useful
* Extendable to Cuckoo, kMinHash, HyperLogLog(++), Counting Bloom Filters, etc.

This project borrows heavily from https://github.com/KenanHanke/rbloom, which is much more production-worthy and better commented/robust/tested.

# Installation And Running
1. Install Mojo
2. Install python (probably via miniconda)
3. Set MOJO_PYTHON_LIBRARY
4. pip install rbloom
5. mojo run main.🔥

# Benchmark Results
Benchmarks always have some unfairness, so take these with a grain of salt. This compares Python calling a Rust library against Mojo calling a Mojo library, which is very much apples-to-oranges.

```
alan@mbalan ~/C/fireweed (main) [1]> mojo run main.🔥 (py310)
===rbloom/python results===
bf=<Bloom size_in_bits=95850584 approx_items=0.0>
add_time=0.9502909183502197
total_time=1.6721301078796387
num_hash_funcs= 6
filter_size_total_bits= 95850624
===mojo results===
---------------------
Benchmark Report (s)
---------------------
Mean: 0.61287924999999999
Total: 2.4515169999999999
Iters: 4
Warmup Mean: 0.61581649999999999
Warmup Total: 1.231633
Warmup Iters: 2
Fastest Mean: 0.61287924999999999
Slowest Mean: 0.61287924999999999
```

# Future work
* Investigate TODOs/HACKs/MUSINGs
* Currently it scales very poorly above ~100k elements on an M2 MBA; the cause is not yet understood.
* Performance profiling
* Write a real test suite

Binary file added __pycache__/bloompy.cpython-311.pyc
Binary file not shown.
88 changes: 88 additions & 0 deletions bloom.🔥
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import math
from collections.vector import InlinedFixedVector

struct Bloom[filter_size_64bits: Int, num_hashes: Int]():
    """Implements a bloom filter with addition and probabilistic containment.

    The filter occupies a fixed amount of space. A negative containment answer
    is always correct; a positive answer may be a false positive.

    Parameters:
        filter_size_64bits: Internal state size in 64bit increments.
        num_hashes: Number of buckets/indices in the filter to fill per addition.
    """

    # TODO: aliases cannot access aliases
    # TODO: Add aliases/params for chunk_type, simdwidth, chunk_size
    # Bit array, stored as `filter_size_64bits` 64-bit words.
    var _state: DTypePointer[DType.uint64]

    fn __init__(inout self):
        """Allocates the bit array and clears every bit."""
        # Allocate through the type: `self._state` holds no value yet.
        self._state = DTypePointer[DType.uint64].alloc(filter_size_64bits)
        memset_zero(self._state, filter_size_64bits)

    fn __del__(owned self):
        """Frees the bit array allocated in `__init__`."""
        self._state.free()

    @always_inline
    fn add[T: Hashable](inout self, el: T):
        """Adds an element to the bloom filter.

        Args:
            el: Element whose hash selects the `num_hashes` bits to set.
        """
        var idxs = self._generate_indicies(el)
        @unroll(4)
        for i in range(num_hashes):
            var bit_loc = idxs[i]
            var filter_chunk = bit_loc // 64
            var loc_within_chunk = bit_loc % 64
            # Build the mask as UInt64, exactly as `__contains__` does.
            self._state[filter_chunk.to_int()] |= (UInt64(1) << loc_within_chunk)

    @always_inline
    fn __contains__[T: Hashable](self, el: T) -> Bool:
        """Returns containment status w/ false positives.

        Args:
            el: Element to look up.

        Returns:
            False as soon as one of the element's bits is unset, True when all
            `num_hashes` bits are set.
        """
        var idxs = self._generate_indicies(el)
        @unroll(4)
        for i in range(num_hashes):
            var bit_loc = idxs[i]
            var filter_chunk = bit_loc // 64
            var loc_within_chunk = bit_loc % 64
            if (self._state[filter_chunk.to_int()] & (UInt64(1) << loc_within_chunk)) == UInt64(0):
                return False
        return True

    @always_inline
    fn _generate_indicies[T: Hashable](self, el: T) -> InlinedFixedVector[UInt64, num_hashes]:
        """Generates num_hashes bit indices in [0, 64*filter_size_64bits).

        The first index comes from `hash(el)`; each following one comes from
        advancing that value with a linear congruential step.

        Args:
            el: Element to derive the indices from.

        Returns:
            A vector holding exactly `num_hashes` indices.
        """
        # Linear Congruential Generator params taken from https://nuclear.llnl.gov/CNP/rng/rngman/node4.html
        var a = 2862933555777941757
        var b = 3037000493
        var total_bits = filter_size_64bits * 64
        var ret = InlinedFixedVector[UInt64, num_hashes](num_hashes)
        var entropy: UInt64 = hash(el)
        @unroll(4)
        for i in range(num_hashes):
            # append (rather than ret[i] = ...) so the vector's length tracks
            # the number of stored indices.
            ret.append(entropy % total_bits)
            entropy = a * entropy + b
        return ret


# TODO: Move these fn's to a class method `from_expectation`. It proved
# difficult to have an unbound return type on a class method.
@parameter
fn filter_size(expected_items: Int, false_pos_rate: Float64) -> Int:
    """Calculates the filter size as a count of 64-bit words.

    The bit count is -expected_items * ln(false_pos_rate) / ln(2)^2, truncated
    to an integer and then rounded up to the next 64-bit increment.

    Args:
        expected_items: Number of elements the filter is sized for.
        false_pos_rate: Target false-positive probability.

    Returns:
        Number of 64-bit words to allocate.
    """
    # TODO: Add citation for formula
    var ln2 = math.log(Float64(2.0))
    var scaled = expected_items * math.log(false_pos_rate) / ln2**2
    # `scaled` is negative for rates below 1; negate after truncating.
    var bit_count = -1 * scaled.to_int()
    return bit_count // 64 + 1

@parameter
fn num_hash_funcs(expected_items: Int, false_pos_rate: Float64) -> Int:
    """Calculates the number of bits to set per element, commonly called `k`.

    Computes (filter bits / expected_items) * ln(2), truncated to an integer
    and never less than 1.

    Args:
        expected_items: Number of elements the filter is sized for.
        false_pos_rate: Target false-positive probability.

    Returns:
        Number of hash-derived indices per element, at least 1.
    """
    # TODO: Add citation for formula.
    # Guard the division below; with nothing to size for, use one hash.
    if expected_items <= 0:
        return 1
    var filter_size_64bits = filter_size(expected_items, false_pos_rate)
    var log2 = math.log(Float64(2.0))
    var ret = (filter_size_64bits * 64 / expected_items) * log2
    var k = ret.to_int()
    # With zero hashes Bloom.__contains__ never inspects a bit and always
    # returns True, so clamp to one.
    if k < 1:
        return 1
    return k

17 changes: 17 additions & 0 deletions bloompy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import rbloom
import time

def run_python_benchmark(size: int) -> None:
    """Time `size` insertions and `size` membership checks on an rbloom filter.

    Builds ``rbloom.Bloom(size, 0.01)``, adds ``i + 0.5`` for every ``i`` in
    ``range(size)``, then verifies each of those values is reported as present.
    Prints the filter, the time spent adding, and the total time (adding plus
    checking).

    Raises:
        AssertionError: if an added value is not found in the filter.
    """
    bf = rbloom.Bloom(size, 0.01)
    print(f"{bf=}")

    # perf_counter() is a monotonic, high-resolution clock meant for timing;
    # time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    for i in range(size):
        bf.add(i + 0.5)
    add_time = time.perf_counter() - start

    for i in range(size):
        # Explicit check rather than `assert`, so the lookups are still
        # executed (and timed) when Python runs with -O.
        if i + 0.5 not in bf:
            raise AssertionError(f"{i + 0.5} missing from filter")
    total_time = time.perf_counter() - start

    print(f"{add_time=}\n{total_time=}\n\n", flush=True)
70 changes: 70 additions & 0 deletions main.🔥
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from python import Python
import bloom
import benchmark
import time
import testing

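# TODO: Get ELEMENTS to work as a parameter (alias ELEMENTS = 10_000_000).
# Until then the element count 10_000_000 is hard-coded in run_python,
# bloom_mojo_single_run and test_mojo and must be edited in each place.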

fn main() raises:
    """Runs the Python benchmark, then the Mojo self-test, then the Mojo benchmark."""
    run_python()
    test_mojo()
    run_mojo()


fn run_python() raises:
    """Imports the local `bloompy` Python module and runs its benchmark."""
    print("===rbloom/python results===")
    # Put the current directory on the Python path so `bloompy` can be imported.
    Python.add_to_path(".")
    var py_bench = Python.import_module("bloompy")
    py_bench.run_python_benchmark(10_000_000)

fn run_mojo():
    """Benchmarks `bloom_mojo_single_run` and prints the resulting report."""
    print("===mojo results===")
    benchmark.run[bloom_mojo_single_run]().print()

fn bloom_mojo_single_run():
    """One benchmark iteration: fill a filter with 10M values, then look each one up."""
    var bloom_filter = bloom.Bloom[
        bloom.filter_size(10_000_000, 0.01),
        bloom.num_hash_funcs(10_000_000, 0.01),
    ]()

    for n in range(10_000_000):
        # match python impl to avoid special-case int hashing.
        bloom_filter.add(n + 0.5)

    for n in range(10_000_000):
        var hit = n + 0.5 in bloom_filter
        # Hand the result to the benchmark module so the lookup is not discarded.
        benchmark.keep(hit)


fn test_mojo() raises:
    """Checks the Mojo bloom filter for missing entries and excess false positives.

    Adds i + 0.5 for every even i below 20M, asserts that each of those values
    is found, then counts how many odd-i values (never added) are reported as
    present and asserts that fewer than 10% are.

    Raises:
        Error: if an added value is not found or too many false positives occur.
    """
    var bf = bloom.Bloom[
        bloom.filter_size(10_000_000, 0.01),
        bloom.num_hash_funcs(10_000_000, 0.01),
    ]()
    print("num_hash_funcs=",bloom.num_hash_funcs(10_000_000, 0.01))
    print("filter_size_total_bits=", bloom.filter_size(10_000_000, 0.01)*64)

    for i in range(0, 10_000_000 * 2, 2):
        bf.add(i + 0.5)

    for i in range(0, 10_000_000 * 2, 2):
        # Only format the failure message when the lookup actually fails,
        # instead of building a string on every passing iteration.
        if not (i + 0.5 in bf):
            testing.assert_true(False, "did not find value when expected:" + str(i+0.5))

    # Values with odd i were never added, so every hit here is a false positive.
    var trues = 0
    for i in range(1, 10_000_000 * 2, 2):
        if i+0.5 in bf:
            trues += 1
    testing.assert_true(trues / 10_000_000 < 0.10, "found too many false positives: " + str(trues))









0 comments on commit 1e799b1

Please sign in to comment.