Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create a lot simpler sorted uint64 codec #2716

Merged
merged 4 commits into from
Oct 31, 2018
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions binary/codec.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package main

import (
"github.com/dgraph-io/dgraph/protos/pb"
)

func packBlock(uids []uint64) *pb.UidBlock {
manishrjain marked this conversation as resolved.
Show resolved Hide resolved
if len(uids) == 0 {
return nil
}
block := &pb.UidBlock{Base: uids[0]}
last := uids[0]
for _, uid := range uids[1:] {
block.Deltas = append(block.Deltas, uid-last)
last = uid
}
return block
}

// Encode takes in a list of uids and a block size. It would pack these uids into blocks of the
// given size, with the last block having fewer uids. Within each block, it stores the first uid as
// base. For each next uid, a delta = uids[i] - uids[i-1] is stored. Protobuf uses Varint encoding,
// as mentioned here: https://developers.google.com/protocol-buffers/docs/encoding . This ensures
// that the deltas being considerably smaller than the original uids are nicely packed in fewer
// bytes. Our benchmarks on artificial data show compressed size to be 13% of the original. This
// mechanism is a LOT simpler to understand and if needed, debug.
func Encode(uids []uint64, blockSize int) pb.UidPack {
manishrjain marked this conversation as resolved.
Show resolved Hide resolved
pack := pb.UidPack{BlockSize: uint32(blockSize)}
for {
if len(uids) <= blockSize {
block := packBlock(uids)
pack.Blocks = append(pack.Blocks, block)
return pack
} else {
block := packBlock(uids[:blockSize])
pack.Blocks = append(pack.Blocks, block)
uids = uids[blockSize:] // Advance.
}
}
}

// NumUids returns the number of uids stored in a UidPack.
func NumUids(pack pb.UidPack) int {
manishrjain marked this conversation as resolved.
Show resolved Hide resolved
sz := len(pack.Blocks)
lastBlock := pack.Blocks[sz-1]
return (sz-1)*int(pack.BlockSize) + len(lastBlock.Deltas) + 1 // We don't store base in deltas.
}

// Decode decodes the UidPack back into the list of uids. This is a stop-gap function, Decode would
// need to do more specific things than just return the list back.
func Decode(pack pb.UidPack) []uint64 {
manishrjain marked this conversation as resolved.
Show resolved Hide resolved
uids := make([]uint64, NumUids(pack))
uids = uids[:0]

for _, block := range pack.Blocks {
last := block.Base
uids = append(uids, last)
for _, delta := range block.Deltas {
uid := last + delta
uids = append(uids, uid)
last = uid
}
}
return uids
}
131 changes: 131 additions & 0 deletions binary/codec_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
package main

import (
"bytes"
"compress/gzip"
"encoding/binary"
"math/rand"
"testing"
"time"

"github.com/dgraph-io/dgraph/x"
humanize "github.com/dustin/go-humanize"
"github.com/stretchr/testify/require"
)

func getUids(size int) []uint64 {
var uids []uint64
last := uint64(rand.Intn(100))
uids = append(uids, last)
for i := 1; i < size; i++ {
last += uint64(rand.Intn(33))
uids = append(uids, last)
}
return uids
}

func TestUidPack(t *testing.T) {
rand.Seed(time.Now().UnixNano())

for i := 0; i < 13; i++ {
size := rand.Intn(10e6)
if size < 0 {
size = 1e6
}
t.Logf("Testing with size = %d", size)

expected := getUids(size)
pack := Encode(expected, 256)
for _, block := range pack.Blocks {
require.True(t, len(block.Deltas) <= 255)
}
require.Equal(t, len(expected), NumUids(pack))
actual := Decode(pack)
require.Equal(t, expected, actual)
}
}

func BenchmarkGzip(b *testing.B) {
rand.Seed(time.Now().UnixNano())

uids := getUids(1e6)
b.ResetTimer()
sz := uint64(len(uids)) * 8

b.Logf("Dataset Len=%d. Size: %s", len(uids), humanize.Bytes(sz))
var data []byte
for i := 0; i < b.N; i++ {
tmp := make([]byte, binary.MaxVarintLen64)
var buf bytes.Buffer
for _, uid := range uids {
n := binary.PutUvarint(tmp, uid)
_, err := buf.Write(tmp[:n])
x.Check(err)
}

var out bytes.Buffer
zw := gzip.NewWriter(&out)
_, err := zw.Write(buf.Bytes())
x.Check(err)

data = out.Bytes()
}
b.Logf("Output size: %s. Compression: %.2f",
humanize.Bytes(uint64(len(data))),
float64(len(data))/float64(sz))
}

func benchmarkUidPackEncode(b *testing.B, blockSize int) {
rand.Seed(time.Now().UnixNano())

uids := getUids(1e6)
sz := uint64(len(uids)) * 8
b.Logf("Dataset Len=%d. Size: %s", len(uids), humanize.Bytes(sz))
b.ResetTimer()

var data []byte
for i := 0; i < b.N; i++ {
pack := Encode(uids, blockSize)
out, err := pack.Marshal()
x.Check(err)
data = out
}
b.Logf("Output size: %s. Compression: %.2f",
humanize.Bytes(uint64(len(data))),
float64(len(data))/float64(sz))
}

func BenchmarkUidPack(b *testing.B) {
b.Run("encode/128", func(b *testing.B) {
benchmarkUidPackEncode(b, 128)
})
b.Run("encode/256", func(b *testing.B) {
benchmarkUidPackEncode(b, 256)
})
b.Run("decode/128", func(b *testing.B) {
benchmarkUidPackDecode(b, 128)
})
b.Run("decode/256", func(b *testing.B) {
benchmarkUidPackDecode(b, 256)
})
}

func benchmarkUidPackDecode(b *testing.B, blockSize int) {
rand.Seed(time.Now().UnixNano())

uids := getUids(1e6)
sz := uint64(len(uids)) * 8
b.Logf("Dataset Len=%d. Size: %s", len(uids), humanize.Bytes(sz))

pack := Encode(uids, blockSize)
data, err := pack.Marshal()
x.Check(err)
b.Logf("Output size: %s. Compression: %.2f",
humanize.Bytes(uint64(len(data))),
float64(len(data))/float64(sz))

b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = Decode(pack)
}
}
10 changes: 10 additions & 0 deletions protos/pb.proto
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,16 @@ message Posting {
uint64 commit_ts = 14; // Meant to use only inmemory
}

message UidBlock {
uint64 base = 1;
repeated uint64 deltas = 2;
}

message UidPack {
uint32 block_size = 1;
repeated UidBlock blocks = 2;
}

message PostingList {
repeated Posting postings = 1;
bytes checksum = 2;
Expand Down
Loading