Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add stateless dictionary support #216

Merged
merged 5 commits into from
Feb 4, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions flate/flate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,31 @@ func TestRegressions(t *testing.T) {
}
})
}
t.Run(tt.Name+"stateless", func(t *testing.T) {
// Split into two and use history...
buf := new(bytes.Buffer)
err = StatelessDeflate(buf, data1[:len(data1)/2], false)
if err != nil {
t.Error(err)
}

// Use top half as dictionary...
dict := data1[:len(data1)/2]
err = StatelessDeflate(buf, data1[len(data1)/2:], true, dict...)
if err != nil {
t.Error(err)
}
t.Log(buf.Len())
fr1 := NewReader(buf)
data2, err := ioutil.ReadAll(fr1)
if err != nil {
t.Error(err)
}
if bytes.Compare(data1, data2) != 0 {
fmt.Printf("want:%x\ngot: %x\n", data1, data2)
t.Error("not equal")
}
})
}
}

Expand Down
9 changes: 7 additions & 2 deletions flate/huffman_bit_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,11 @@ func (w *huffmanBitWriter) flush() {
w.nbits = 0
return
}
if w.lastHeader > 0 {
// We owe an EOB
w.writeCode(w.literalEncoding.codes[endBlockMarker])
w.lastHeader = 0
}
n := w.nbytes
for w.nbits != 0 {
w.bytes[n] = byte(w.bits)
Expand Down Expand Up @@ -594,8 +599,8 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
tokens.AddEOB()
}

// We cannot reuse pure huffman table.
if w.lastHuffMan && w.lastHeader > 0 {
// We cannot reuse pure huffman table, and must mark as EOF.
if (w.lastHuffMan || eof) && w.lastHeader > 0 {
// We will not try to reuse.
w.writeCode(w.literalEncoding.codes[endBlockMarker])
w.lastHeader = 0
Expand Down
60 changes: 44 additions & 16 deletions flate/stateless.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import (

const (
maxStatelessBlock = math.MaxInt16
// dictionary will be taken from maxStatelessBlock, so limit it.
maxStatelessDict = 8 << 10

slTableBits = 13
slTableSize = 1 << slTableBits
Expand Down Expand Up @@ -59,7 +61,7 @@ var bitWriterPool = sync.Pool{

// StatelessDeflate allows to compress directly to a Writer without retaining state.
// When returning everything will be flushed.
func StatelessDeflate(out io.Writer, in []byte, eof bool) error {
func StatelessDeflate(out io.Writer, in []byte, eof bool, dict ...byte) error {
klauspost marked this conversation as resolved.
Show resolved Hide resolved
var dst tokens
bw := bitWriterPool.Get().(*huffmanBitWriter)
bw.reset(out)
Expand All @@ -76,35 +78,53 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool) error {
return bw.err
}

// Truncate dict
if len(dict) > maxStatelessDict {
dict = dict[len(dict)-maxStatelessDict:]
}

for len(in) > 0 {
todo := in
if len(todo) > maxStatelessBlock {
todo = todo[:maxStatelessBlock]
if len(todo) > maxStatelessBlock-len(dict) {
todo = todo[:maxStatelessBlock-len(dict)]
}
in = in[len(todo):]
uncompressed := todo
if len(dict) > 0 {
// combine dict and source
bufLen := len(todo) + len(dict)
combined := make([]byte, bufLen)
copy(combined, dict)
copy(combined[len(dict):], todo)
todo = combined
}
// Compress
statelessEnc(&dst, todo)
statelessEnc(&dst, todo, int16(len(dict)))
isEof := eof && len(in) == 0

if dst.n == 0 {
bw.writeStoredHeader(len(todo), isEof)
bw.writeStoredHeader(len(uncompressed), isEof)
if bw.err != nil {
return bw.err
}
bw.writeBytes(todo)
} else if int(dst.n) > len(todo)-len(todo)>>4 {
bw.writeBytes(uncompressed)
} else if int(dst.n) > len(uncompressed)-len(uncompressed)>>4 {
// If we removed less than 1/16th, huffman compress the block.
bw.writeBlockHuff(isEof, todo, false)
bw.writeBlockHuff(isEof, uncompressed, len(in) == 0)
} else {
bw.writeBlockDynamic(&dst, isEof, todo, false)
bw.writeBlockDynamic(&dst, isEof, uncompressed, len(in) == 0)
}
if len(in) > 0 {
// Retain a dict if we have more
dict = todo[len(todo)-maxStatelessDict:]
dst.Reset()
}
if bw.err != nil {
return bw.err
}
dst.Reset()
}
if !eof {
// Align.
// Align, only a stored block can do that.
bw.writeStoredHeader(0, false)
}
bw.flush()
Expand All @@ -130,7 +150,7 @@ func load6416(b []byte, i int16) uint64 {
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

func statelessEnc(dst *tokens, src []byte) {
func statelessEnc(dst *tokens, src []byte, startAt int16) {
const (
inputMargin = 12 - 1
minNonLiteralBlockSize = 1 + 1 + inputMargin
Expand All @@ -144,15 +164,23 @@ func statelessEnc(dst *tokens, src []byte) {

// This check isn't in the Snappy implementation, but there, the caller
// instead of the callee handles this case.
if len(src) < minNonLiteralBlockSize {
if len(src)-int(startAt) < minNonLiteralBlockSize {
// We do not fill the token table.
// This will be picked up by caller.
dst.n = uint16(len(src))
dst.n = 0
return
}
// Index until startAt
if startAt > 0 {
cv := load3232(src, 0)
for i := int16(0); i < startAt; i++ {
table[hashSL(cv)] = tableEntry{offset: i}
cv = (cv >> 8) | (uint32(src[i+4]) << 24)
}
}

s := int16(1)
nextEmit := int16(0)
s := startAt + 1
nextEmit := startAt
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
Expand Down