Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roaring: implement ToDense and FromDense #408

Merged
merged 11 commits into from
Dec 18, 2023
162 changes: 162 additions & 0 deletions roaring.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,168 @@
return rb.highlowcontainer.toBytes()
}

// Parameters of the dense representation, in which a bitmap is stored as a
// flat slice of uint64 words.
const (
	// wordSize is the number of bits in one word of a dense bitmap.
	wordSize = uint64(64)

	// log2WordSize is log2(wordSize): shifting a bit position right by this
	// amount gives the index of the word that holds the bit.
	log2WordSize = uint64(6)

	// capacity is the uint64 value with every bit set.
	capacity = ^uint64(0)

	// bitmapContainerSize is the bitmap size in words (1<<16 bits, 64 per word).
	bitmapContainerSize = (1 << 16) / 64
)

// DenseSize returns the size of the bitmap when stored as a dense bitmap.
func (rb *Bitmap) DenseSize() int {
if rb.highlowcontainer.size() == 0 {
return 0
}

maximum := 1 + uint64(rb.Maximum())
if maximum > (capacity - wordSize + 1) {
return int(capacity >> log2WordSize)

Check failure on line 69 in roaring.go

View workflow job for this annotation

GitHub Actions / test (1.14.x, ubuntu-latest)

constant 288230376151711743 overflows int

Check failure on line 69 in roaring.go

View workflow job for this annotation

GitHub Actions / test (1.17.x, ubuntu-latest)

constant 288230376151711743 overflows int
}

return int((maximum + (wordSize - 1)) >> log2WordSize)
}

// ToDense converts the bitmap to its dense form: a freshly allocated slice of
// DenseSize() uint64 words filled in by WriteDenseTo. A bitmap whose dense
// size is zero yields a nil slice.
//
// This is handy for passing a roaring bitmap to libraries that work on the
// dense representation, such as https://github.com/bits-and-blooms/bitset or
// https://github.com/kelindar/bitmap
func (rb *Bitmap) ToDense() []uint64 {
	var dense []uint64
	if words := rb.DenseSize(); words != 0 {
		dense = make([]uint64, words)
		rb.WriteDenseTo(dense)
	}
	return dense
}

// FromDense builds a new bitmap from a dense bitmap given as a slice of uint64
// words. It offers a fast, convenient way to turn bitmaps coming from libraries
// such as https://github.com/bits-and-blooms/bitset or
// https://github.com/kelindar/bitmap into roaring bitmaps.
//
// Only array and bitmap containers are produced, never run containers. Callers
// that want runs of consecutive values compressed further should call
// RunOptimize on the result.
//
// With doCopy set, every bitmap container receives its own copy of the words
// it covers. Use it when the input slice will be modified after this function
// returns, or when keeping references to a large input alive (and out of reach
// of the GC) is undesirable. Even with doCopy unset, one copy can still happen
// when the input length is not divisible by bitmapContainerSize.
func FromDense(bitmap []uint64, doCopy bool) *Bitmap {
	// Ceiling division: one container slot per chunk of bitmapContainerSize
	// words, counting a trailing partial chunk.
	slots := (len(bitmap) + bitmapContainerSize - 1) / bitmapContainerSize

	rb := new(Bitmap)
	ra := &rb.highlowcontainer
	ra.containers = make([]container, 0, slots)
	ra.keys = make([]uint16, 0, slots)
	ra.needCopyOnWrite = make([]bool, 0, slots)

	rb.FromDense(bitmap, doCopy)
	return rb
}

// FromDense unmarshalls from a slice of uint64s representing the bitmap as a dense bitmap.
// Useful to convert bitmaps from libraries like https://github.com/bits-and-blooms/bitset or
// https://github.com/kelindar/bitmap into roaring bitmaps fast and with convenience.
// Callers are responsible for ensuring that the bitmap is empty before calling this function.
//
// This function won't create any run containers, only array and bitmap containers. It's up to
// the caller to call RunOptimize if they want to further compress the runs of consecutive values.
//
// When doCopy is true, the bitmap is copied into a new slice for each bitmap container.
// This is useful when the bitmap is going to be modified after this function returns or if it's
// undesirable to hold references to large bitmaps which the GC wouldn't be able to collect.
// One copy can still happen even when doCopy is false if the bitmap length isn't divisible by bitmapContainerSize.
func (rb *Bitmap) FromDense(bitmap []uint64, doCopy bool) {
if len(bitmap) == 0 {
return
}

var k uint16
const size = bitmapContainerSize

for len(bitmap) > 0 {
hi := size
if len(bitmap) < size {
hi = len(bitmap)
}

words := bitmap[:hi]
count := int(popcntSlice(words))

switch {
case count > arrayDefaultMaxSize:
c := &bitmapContainer{cardinality: count, bitmap: words}
cow := true

if doCopy || len(words) < size {
c.bitmap = make([]uint64, size)
copy(c.bitmap, words)
cow = false
}

rb.highlowcontainer.appendContainer(k, c, cow)
tsenart marked this conversation as resolved.
Show resolved Hide resolved

case count > 0:
c := &arrayContainer{content: make([]uint16, count)}
var pos, base int
for _, w := range words {
for w != 0 {
t := w & -w
c.content[pos] = uint16(base + int(popcount(t-1)))
pos++
w ^= t
}
base += 64
}
rb.highlowcontainer.appendContainer(k, c, false)
}

bitmap = bitmap[hi:]
k++
}
}

// WriteDenseTo writes to a slice of uint64s representing the bitmap as a dense bitmap.
// Callers are responsible for allocating enough space in the bitmap using DenseSize;
// a slice that is too short makes this method panic with an out-of-range index.
// Useful to convert a roaring bitmap to a format that can be used by other libraries
// like https://github.com/bits-and-blooms/bitset or https://github.com/kelindar/bitmap
//
// Values from array and run containers are OR-ed into the destination, while
// the words of a bitmap container are copied over the corresponding region.
// It panics when it meets a container type other than those three.
func (rb *Bitmap) WriteDenseTo(bitmap []uint64) {
	for i, ct := range rb.highlowcontainer.containers {
		// High 16 bits shared by every value in this container.
		hb := uint32(rb.highlowcontainer.keys[i]) << 16

		// Word indices below are derived by shifting in uint32 and only then
		// converting to int. Converting first would yield a negative index
		// for values >= 1<<31 on platforms where int is 32 bits wide.
		switch c := ct.(type) {
		case *arrayContainer:
			for _, x := range c.content {
				n := int((hb | uint32(x)) >> log2WordSize)
				bitmap[n] |= uint64(1) << uint(x%64)
			}

		case *bitmapContainer:
			copy(bitmap[int(hb>>log2WordSize):], c.bitmap)

		case *runContainer16:
			for j := range c.iv {
				// The run covers [start, end), i.e. start..start+length inclusive.
				start := uint32(c.iv[j].start)
				end := start + uint32(c.iv[j].length) + 1
				lo := int((hb | start) >> log2WordSize)
				hi := int((hb | (end - 1)) >> log2WordSize)

				// Mask selecting the bits at or above start in the first word.
				head := ^uint64(0) << uint(start%64)
				// Mask selecting the bits below end in the last word; -end
				// modulo 64 is the number of unused high bits in that word.
				tail := ^uint64(0) >> (uint(-end) % 64)

				if lo == hi {
					// Run contained in a single word.
					bitmap[lo] |= head & tail
					continue
				}

				bitmap[lo] |= head
				for n := lo + 1; n < hi; n++ {
					bitmap[n] = ^uint64(0) // words fully covered by the run
				}
				bitmap[hi] |= tail
			}
		default:
			panic("unsupported container type")
		}
	}
}

// Checksum computes a hash (currently FNV-1a) for a bitmap that is suitable for
// using bitmaps as elements in hash sets or as keys in hash maps, as well as
// generally quicker comparisons.
Expand Down
99 changes: 99 additions & 0 deletions roaring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package roaring

import (
"bytes"
"fmt"
"math"
"math/rand"
"strconv"
Expand Down Expand Up @@ -2543,6 +2544,104 @@ func TestIterateHalt(t *testing.T) {
assert.Equal(t, expected, values)
}

// testDense builds a fixed set of fixture bitmaps and calls fn once per
// fixture, passing a label of the form "<name>-<cardinality>" and the bitmap.
// The fixture names state which container kinds each bitmap is meant to
// exercise.
func testDense(fn func(string, *Bitmap)) {
	// arrayDefaultMaxSize+1 even-spaced values starting just above MaxUint16.
	bitmapFixture := New()
	for i := 0; i <= arrayDefaultMaxSize; i++ {
		bitmapFixture.Add(uint32(MaxUint16 + 1 + 2*i))
	}

	// Two ranges, the second one sized from the fixture above.
	runFixture := New()
	runFixture.AddRange(1, 2)
	card := bitmapFixture.GetCardinality()
	runFixture.AddRange(card, card*2)

	// Exactly arrayDefaultMaxSize even-spaced values.
	arrayFixture := New()
	for i := 1; i <= arrayDefaultMaxSize; i++ {
		arrayFixture.Add(uint32(MaxUint16 + 2*i))
	}

	// One long run of consecutive values, added one by one.
	mixedFixture := New()
	for v := 150000; v < 450000; v++ {
		mixedFixture.Add(uint32(v))
	}

	fixtures := []struct {
		name string
		rb   *Bitmap
	}{
		{name: "bitmap", rb: bitmapFixture},
		{name: "run", rb: runFixture},
		{name: "array", rb: arrayFixture},
		{name: "bitmaps-and-runs", rb: mixedFixture},
	}

	for _, f := range fixtures {
		label := f.name + "-" + strconv.FormatUint(f.rb.GetCardinality(), 10)
		fn(label, f.rb)
	}
}

func TestToDense(t *testing.T) {
testDense(func(name string, rb *Bitmap) {
t.Run(name, func(t *testing.T) {
bm := bitset.From(rb.ToDense())
assert.EqualValues(t, rb.GetCardinality(), uint64(bm.Count()))
rb.Iterate(func(x uint32) bool {
return assert.True(t, bm.Test(uint(x)), "value %d should be set", x)
})
})
})
}

func TestFromDense(t *testing.T) {
testDense(func(name string, rb *Bitmap) {
for _, doCopy := range []bool{false, true} {
t.Run(fmt.Sprintf("%s,doCopy=%t", name, doCopy), func(t *testing.T) {
dense := rb.ToDense()
cp := FromDense(dense, doCopy)
if doCopy {
// Clear the original dense slice to ensure we don't have any
// references to it
for i := range dense {
dense[i] = 0
}
}
assert.True(t, rb.Equals(cp))
})
}
})
}

// BenchmarkFromDense measures (*Bitmap).FromDense on the dense form of every
// testDense fixture, with and without doCopy. Throughput is reported against
// the size of the dense input in bytes (8 per uint64 word).
func BenchmarkFromDense(b *testing.B) {
	testDense(func(name string, rb *Bitmap) {
		dense := make([]uint64, rb.DenseSize())
		rb.WriteDenseTo(dense)

		// The target bitmap is built once and reused by every iteration.
		// NOTE(review): presumably so its internal slices are already sized
		// when timing starts — confirm that Clear keeps their capacity.
		cp := FromDense(dense, false)

		for _, doCopy := range []bool{false, true} {
			b.Run(fmt.Sprintf("%s,doCopy=%t", name, doCopy), func(b *testing.B) {
				b.ReportAllocs()
				b.SetBytes(int64(len(dense) * 8))
				b.ResetTimer()
				for i := 0; i < b.N; i++ {
					// FromDense requires an empty bitmap, so clear before
					// loading: cp is still populated on the very first
					// iteration and after each earlier iteration.
					cp.Clear()
					cp.FromDense(dense, doCopy)
				}
			})
		}
	})
}

func BenchmarkWriteDenseTo(b *testing.B) {
testDense(func(name string, rb *Bitmap) {
b.Run(name, func(b *testing.B) {
dense := make([]uint64, rb.DenseSize())
b.ReportAllocs()
b.SetBytes(int64(len(dense) * 8))
b.ResetTimer()
for i := 0; i < b.N; i++ {
rb.WriteDenseTo(dense)
}
})
})
}

func BenchmarkEvenIntervalArrayUnions(b *testing.B) {
inputBitmaps := make([]*Bitmap, 40)
for i := 0; i < 40; i++ {
Expand Down
Loading