diff --git a/bitwriter.go b/bitwriter.go index 89af705..dfc6036 100644 --- a/bitwriter.go +++ b/bitwriter.go @@ -1,7 +1,5 @@ package brotli -import "github.com/andybalholm/brotli/matchfinder" - /* Copyright 2010 Google Inc. All Rights Reserved. Distributed under MIT license. @@ -56,7 +54,3 @@ func (w *bitWriter) jumpToByteBoundary() { w.bits = 0 w.dst = dst } - -func matchScore(m matchfinder.AbsoluteMatch) int { - return int(backwardReferenceScore(uint(m.End-m.Start), uint(m.Start-m.Match))) -} diff --git a/brotli_test.go b/brotli_test.go index fd4f1f1..108ec07 100644 --- a/brotli_test.go +++ b/brotli_test.go @@ -651,69 +651,69 @@ func benchmark(b *testing.B, filename string, m matchfinder.MatchFinder, blockSi } func TestEncodeM4(t *testing.T) { - test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18, Score: matchScore}, 1<<16) + test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18, DistanceBitCost: 57}, 1<<16) } func BenchmarkEncodeM4(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, Score: matchScore}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, DistanceBitCost: 57}, 1<<16) } func TestEncodeM4Chain1(t *testing.T) { - test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18, ChainLength: 1, Score: matchScore}, 1<<16) + test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18, ChainLength: 1, DistanceBitCost: 57}, 1<<16) } func BenchmarkEncodeM4Chain1(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 1, Score: matchScore}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 1, DistanceBitCost: 57}, 1<<16) } func BenchmarkEncodeM4Chain2(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 2, Score: matchScore}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 2, DistanceBitCost: 57}, 1<<16) } func BenchmarkEncodeM4Chain4(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 4, Score: matchScore}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 4, DistanceBitCost: 57}, 1<<16) } func BenchmarkEncodeM4Chain8(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 8, HashLen: 5, Score: matchScore}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 8, HashLen: 5, DistanceBitCost: 57}, 1<<16) } func BenchmarkEncodeM4Chain16(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 16, HashLen: 5, Score: matchScore}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 16, HashLen: 5, DistanceBitCost: 57}, 1<<16) } func BenchmarkEncodeM4Chain32(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 32, HashLen: 5, Score: matchScore}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 32, HashLen: 5, DistanceBitCost: 57}, 1<<16) } func BenchmarkEncodeM4Chain64(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 64, HashLen: 5, Score: matchScore}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 64, HashLen: 5, DistanceBitCost: 57}, 1<<16) } func BenchmarkEncodeM4Chain128(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 128, HashLen: 5, Score: matchScore}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 128, HashLen: 5, DistanceBitCost: 57}, 1<<16) } func TestEncodeMultiHash6(t *testing.T) { - test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 18, Score: matchScore, HashLengths: []int{6}}, 1<<16) + test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 18, DistanceBitCost: 57, HashLengths: []int{6}}, 1<<16) } func TestEncodeMultiHash6_8(t *testing.T) { - test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 18, Score: matchScore, HashLengths: []int{6, 8}}, 1<<16) + test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 18, DistanceBitCost: 57, HashLengths: []int{6, 8}}, 1<<16) } func BenchmarkEncodeMultiHash6(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, Score: matchScore, HashLengths: []int{6}}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{6}}, 1<<16) } func BenchmarkEncodeMultiHash5_8(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, Score: matchScore, HashLengths: []int{5, 8}}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{5, 8}}, 1<<16) } func BenchmarkEncodeMultiHash5_7_9(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, Score: matchScore, HashLengths: []int{5, 7, 9}}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{5, 7, 9}}, 1<<16) } func BenchmarkEncodeMultiHash5_6_7_9(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, Score: matchScore, HashLengths: []int{5, 6, 7, 9}}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{5, 6, 7, 9}}, 1<<16) } diff --git a/matchfinder/emitter.go b/matchfinder/emitter.go index b80659f..37ed8e1 100644 --- a/matchfinder/emitter.go +++ b/matchfinder/emitter.go @@ -1,8 +1,8 @@ package matchfinder -// An AbsoluteMatch is like a Match, but it stores indexes into the byte +// An absoluteMatch is like a Match, but it stores indexes into the byte // stream instead of lengths. -type AbsoluteMatch struct { +type absoluteMatch struct { // Start is the index of the first byte. Start int @@ -24,7 +24,7 @@ type matchEmitter struct { NextEmit int } -func (e *matchEmitter) emit(m AbsoluteMatch) { +func (e *matchEmitter) emit(m absoluteMatch) { e.Dst = append(e.Dst, Match{ Unmatched: m.Start - e.NextEmit, Length: m.End - m.Start, @@ -35,7 +35,7 @@ func (e *matchEmitter) emit(m AbsoluteMatch) { // trim shortens m if it extends past maxEnd. Then if the length is at least // minLength, the match is emitted. -func (e *matchEmitter) trim(m AbsoluteMatch, maxEnd int, minLength int) { +func (e *matchEmitter) trim(m absoluteMatch, maxEnd int, minLength int) { if m.End > maxEnd { m.End = maxEnd } diff --git a/matchfinder/m4.go b/matchfinder/m4.go index 611a475..a1fc824 100644 --- a/matchfinder/m4.go +++ b/matchfinder/m4.go @@ -32,9 +32,14 @@ type M4 struct { // locations with the same hash as the current location. ChainLength int - // Score is the rating function used to choose the best match. - // The default is the length of the match. - Score func(AbsoluteMatch) int + // DistanceBitCost is used when comparing two matches to see + // which is better. The comparison is primarily based on the length + // of the matches, but it can also take the distance into account, + // in terms of the number of bits needed to represent the distance. + // One byte of length is given a score of 256, so 32 (256/8) would + // be a reasonable first guess for the value of one bit. + // (The default is 0, which bases the comparison solely on length.) + DistanceBitCost int table []uint32 chain []uint16 @@ -50,6 +55,10 @@ func (q *M4) Reset() { q.chain = q.chain[:0] } +func (q *M4) score(m absoluteMatch) int { + return (m.End-m.Start)*256 + bits.LeadingZeros32(uint32(m.Start-m.Match))*q.DistanceBitCost +} + func (q *M4) FindMatches(dst []Match, src []byte) []Match { if q.MaxDistance == 0 { q.MaxDistance = 65535 @@ -66,11 +75,6 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { if len(q.table) < 1<= matches[0].End { + if matches[0] != (absoluteMatch{}) && i >= matches[0].End { // We have found some matches, and we're far enough along that we probably // won't find overlapping matches, so we might as well emit them. - if matches[1] != (AbsoluteMatch{}) { + if matches[1] != (absoluteMatch{}) { e.trim(matches[1], matches[0].Start, q.MinLength) } e.emit(matches[0]) - matches = [3]AbsoluteMatch{} + matches = [3]absoluteMatch{} } // Calculate and store the hash. @@ -133,7 +137,7 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { } // Look for a match. - var currentMatch AbsoluteMatch + var currentMatch absoluteMatch if i-candidate != matches[0].Start-matches[0].Match { if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) { @@ -156,24 +160,24 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { if i-candidate != matches[0].Start-matches[0].Match { if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) { m := extendMatch2(src, i, candidate, e.NextEmit) - if m.End-m.Start > q.MinLength && q.Score(m) > q.Score(currentMatch) { + if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) { currentMatch = m } } } } - if q.Score(currentMatch) <= q.Score(matches[0]) { + if q.score(currentMatch) <= q.score(matches[0]) { continue } - matches = [3]AbsoluteMatch{ + matches = [3]absoluteMatch{ currentMatch, matches[0], matches[1], } - if matches[2] == (AbsoluteMatch{}) { + if matches[2] == (absoluteMatch{}) { continue } @@ -181,34 +185,34 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { switch { case matches[0].Start < matches[2].End: // The first and third matches overlap; discard the one in between. - matches = [3]AbsoluteMatch{ + matches = [3]absoluteMatch{ matches[0], matches[2], - AbsoluteMatch{}, + absoluteMatch{}, } case matches[0].Start < matches[2].End+q.MinLength: // The first and third matches don't overlap, but there's no room for // another match between them. Emit the first match and discard the second. e.emit(matches[2]) - matches = [3]AbsoluteMatch{ + matches = [3]absoluteMatch{ matches[0], - AbsoluteMatch{}, - AbsoluteMatch{}, + absoluteMatch{}, + absoluteMatch{}, } default: // Emit the first match, shortening it if necessary to avoid overlap with the second. e.trim(matches[2], matches[1].Start, q.MinLength) - matches[2] = AbsoluteMatch{} + matches[2] = absoluteMatch{} } } // We've found all the matches now; emit the remaining ones. - if matches[1] != (AbsoluteMatch{}) { + if matches[1] != (absoluteMatch{}) { e.trim(matches[1], matches[0].Start, q.MinLength) } - if matches[0] != (AbsoluteMatch{}) { + if matches[0] != (absoluteMatch{}) { e.emit(matches[0]) } @@ -265,13 +269,13 @@ func extendMatch(src []byte, i, j int) int { // Given a 4-byte match at src[start] and src[candidate], extendMatch2 extends it // upward as far as possible, and downward no farther than to min. -func extendMatch2(src []byte, start, candidate, min int) AbsoluteMatch { +func extendMatch2(src []byte, start, candidate, min int) absoluteMatch { end := extendMatch(src, candidate+4, start+4) for start > min && candidate > 0 && src[start-1] == src[candidate-1] { start-- candidate-- } - return AbsoluteMatch{ + return absoluteMatch{ Start: start, End: end, Match: candidate, diff --git a/matchfinder/multihash.go b/matchfinder/multihash.go index b33e58e..adc0b40 100644 --- a/matchfinder/multihash.go +++ b/matchfinder/multihash.go @@ -2,6 +2,7 @@ package matchfinder import ( "encoding/binary" + "math/bits" "sort" ) @@ -26,9 +27,14 @@ type MultiHash struct { // The default is 17 (128K entries). TableBits int - // Score is the rating function used to choose the best match. - // The default is the length of the match. - Score func(AbsoluteMatch) int + // DistanceBitCost is used when comparing two matches to see + // which is better. The comparison is primarily based on the length + // of the matches, but it can also take the distance into account, + // in terms of the number of bits needed to represent the distance. + // One byte of length is given a score of 256, so 32 (256/8) would + // be a reasonable first guess for the value of one bit. + // (The default is 0, which bases the comparison solely on length.) + DistanceBitCost int tables [][]uint32 @@ -44,6 +50,10 @@ func (q *MultiHash) Reset() { q.history = q.history[:0] } +func (q *MultiHash) score(m absoluteMatch) int { + return (m.End-m.Start)*256 + bits.LeadingZeros32(uint32(m.Start-m.Match))*q.DistanceBitCost +} + func (q *MultiHash) FindMatches(dst []Match, src []byte) []Match { if q.MaxDistance == 0 { q.MaxDistance = 65535 @@ -60,11 +70,6 @@ func (q *MultiHash) FindMatches(dst []Match, src []byte) []Match { q.tables[i] = make([]uint32, 1<= matches[0].End { + if matches[0] != (absoluteMatch{}) && i >= matches[0].End { // We have found some matches, and we're far enough along that we probably // won't find overlapping matches, so we might as well emit them. - if matches[1] != (AbsoluteMatch{}) { + if matches[1] != (absoluteMatch{}) { e.trim(matches[1], matches[0].Start, q.MinLength) } e.emit(matches[0]) - matches = [3]AbsoluteMatch{} + matches = [3]absoluteMatch{} } // Calculate and store the hashes. @@ -124,7 +129,7 @@ func (q *MultiHash) FindMatches(dst []Match, src []byte) []Match { } // Look for a match. - var currentMatch AbsoluteMatch + var currentMatch absoluteMatch if i < matches[0].End { // If we're looking for an overlapping match, we only need to check the @@ -161,23 +166,23 @@ func (q *MultiHash) FindMatches(dst []Match, src []byte) []Match { break } m := extendMatch2(src, i, candidate, e.NextEmit) - if m.End-m.Start > q.MinLength && q.Score(m) > q.Score(currentMatch) { + if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) { currentMatch = m } } } - if currentMatch == (AbsoluteMatch{}) || q.Score(currentMatch) <= q.Score(matches[0]) { + if currentMatch == (absoluteMatch{}) || q.score(currentMatch) <= q.score(matches[0]) { continue } - matches = [3]AbsoluteMatch{ + matches = [3]absoluteMatch{ currentMatch, matches[0], matches[1], } - if matches[2] == (AbsoluteMatch{}) { + if matches[2] == (absoluteMatch{}) { continue } @@ -185,34 +190,34 @@ func (q *MultiHash) FindMatches(dst []Match, src []byte) []Match { switch { case matches[0].Start < matches[2].End: // The first and third matches overlap; discard the one in between. - matches = [3]AbsoluteMatch{ + matches = [3]absoluteMatch{ matches[0], matches[2], - AbsoluteMatch{}, + absoluteMatch{}, } case matches[0].Start < matches[2].End+q.MinLength: // The first and third matches don't overlap, but there's no room for // another match between them. Emit the first match and discard the second. e.emit(matches[2]) - matches = [3]AbsoluteMatch{ + matches = [3]absoluteMatch{ matches[0], - AbsoluteMatch{}, - AbsoluteMatch{}, + absoluteMatch{}, + absoluteMatch{}, } default: // Emit the first match, shortening it if necessary to avoid overlap with the second. e.trim(matches[2], matches[1].Start, q.MinLength) - matches[2] = AbsoluteMatch{} + matches[2] = absoluteMatch{} } } // We've found all the matches now; emit the remaining ones. - if matches[1] != (AbsoluteMatch{}) { + if matches[1] != (absoluteMatch{}) { e.trim(matches[1], matches[0].Start, q.MinLength) } - if matches[0] != (AbsoluteMatch{}) { + if matches[0] != (absoluteMatch{}) { e.emit(matches[0]) }