From 24b2bfad2d0c50a17d75dbb35e208adcbc1c4708 Mon Sep 17 00:00:00 2001 From: Andy Balholm Date: Tue, 2 Jan 2024 13:38:12 -0800 Subject: [PATCH] matchfinder.M4: add Score function --- bitwriter.go | 6 ++++++ brotli_test.go | 22 +++++++++---------- matchfinder/emitter.go | 8 +++---- matchfinder/m4.go | 48 +++++++++++++++++++++++++----------------- 4 files changed, 50 insertions(+), 34 deletions(-) diff --git a/bitwriter.go b/bitwriter.go index dfc6036..89af705 100644 --- a/bitwriter.go +++ b/bitwriter.go @@ -1,5 +1,7 @@ package brotli +import "github.com/andybalholm/brotli/matchfinder" + /* Copyright 2010 Google Inc. All Rights Reserved. Distributed under MIT license. @@ -54,3 +56,7 @@ func (w *bitWriter) jumpToByteBoundary() { w.bits = 0 w.dst = dst } + +func matchScore(m matchfinder.AbsoluteMatch) int { + return int(backwardReferenceScore(uint(m.End-m.Start), uint(m.Start-m.Match))) +} diff --git a/brotli_test.go b/brotli_test.go index 54c88ac..bbef7c9 100644 --- a/brotli_test.go +++ b/brotli_test.go @@ -651,45 +651,45 @@ func benchmark(b *testing.B, filename string, m matchfinder.MatchFinder, blockSi } func TestEncodeM4(t *testing.T) { - test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18}, 1<<16) + test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18, Score: matchScore}, 1<<16) } func BenchmarkEncodeM4(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, Score: matchScore}, 1<<16) } func TestEncodeM4Chain1(t *testing.T) { - test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18, ChainLength: 1}, 1<<16) + test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18, ChainLength: 1, Score: matchScore}, 1<<16) } func BenchmarkEncodeM4Chain1(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 1}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 1, Score: matchScore}, 1<<16) } func BenchmarkEncodeM4Chain2(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 2}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 2, Score: matchScore}, 1<<16) } func BenchmarkEncodeM4Chain4(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 4}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 4, Score: matchScore}, 1<<16) } func BenchmarkEncodeM4Chain8(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 8}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 8, HashLen: 5, Score: matchScore}, 1<<16) } func BenchmarkEncodeM4Chain16(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 16}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 16, HashLen: 5, Score: matchScore}, 1<<16) } func BenchmarkEncodeM4Chain32(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 32}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 32, HashLen: 5, Score: matchScore}, 1<<16) } func BenchmarkEncodeM4Chain64(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 64}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 64, HashLen: 5, Score: matchScore}, 1<<16) } func BenchmarkEncodeM4Chain128(b *testing.B) { - benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 128}, 1<<16) + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 128, HashLen: 5, Score: matchScore}, 1<<16) } diff --git a/matchfinder/emitter.go b/matchfinder/emitter.go index 37ed8e1..b80659f 100644 --- a/matchfinder/emitter.go +++ b/matchfinder/emitter.go @@ -1,8 +1,8 @@ package matchfinder -// An absoluteMatch is like a Match, but it stores indexes into the byte +// An AbsoluteMatch is like a Match, but it stores indexes into the byte // stream instead of lengths. -type absoluteMatch struct { +type AbsoluteMatch struct { // Start is the index of the first byte. Start int @@ -24,7 +24,7 @@ type matchEmitter struct { NextEmit int } -func (e *matchEmitter) emit(m absoluteMatch) { +func (e *matchEmitter) emit(m AbsoluteMatch) { e.Dst = append(e.Dst, Match{ Unmatched: m.Start - e.NextEmit, Length: m.End - m.Start, @@ -35,7 +35,7 @@ func (e *matchEmitter) emit(m absoluteMatch) { // trim shortens m if it extends past maxEnd. Then if the length is at least // minLength, the match is emitted. -func (e *matchEmitter) trim(m absoluteMatch, maxEnd int, minLength int) { +func (e *matchEmitter) trim(m AbsoluteMatch, maxEnd int, minLength int) { if m.End > maxEnd { m.End = maxEnd } diff --git a/matchfinder/m4.go b/matchfinder/m4.go index 6233308..611a475 100644 --- a/matchfinder/m4.go +++ b/matchfinder/m4.go @@ -32,6 +32,10 @@ type M4 struct { // locations with the same hash as the current location. ChainLength int + // Score is the rating function used to choose the best match. + // The default is the length of the match. + Score func(AbsoluteMatch) int + table []uint32 chain []uint16 @@ -62,6 +66,12 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { if len(q.table) < 1< q.MaxDistance*2 { @@ -92,16 +102,16 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { // matches stores the matches that have been found but not emitted, // in reverse order. (matches[0] is the most recent one.) - var matches [3]absoluteMatch + var matches [3]AbsoluteMatch for i := e.NextEmit; i < len(src)-7; i++ { - if matches[0] != (absoluteMatch{}) && i >= matches[0].End { + if matches[0] != (AbsoluteMatch{}) && i >= matches[0].End { // We have found some matches, and we're far enough along that we probably // won't find overlapping matches, so we might as well emit them. - if matches[1] != (absoluteMatch{}) { + if matches[1] != (AbsoluteMatch{}) { e.trim(matches[1], matches[0].Start, q.MinLength) } e.emit(matches[0]) - matches = [3]absoluteMatch{} + matches = [3]AbsoluteMatch{} } // Calculate and store the hash. @@ -123,7 +133,7 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { } // Look for a match. - var currentMatch absoluteMatch + var currentMatch AbsoluteMatch if i-candidate != matches[0].Start-matches[0].Match { if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) { @@ -146,24 +156,24 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { if i-candidate != matches[0].Start-matches[0].Match { if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) { m := extendMatch2(src, i, candidate, e.NextEmit) - if m.End-m.Start > q.MinLength && m.End-m.Start > currentMatch.End-currentMatch.Start { + if m.End-m.Start > q.MinLength && q.Score(m) > q.Score(currentMatch) { currentMatch = m } } } } - if currentMatch.End-currentMatch.Start <= matches[0].End-matches[0].Start { + if q.Score(currentMatch) <= q.Score(matches[0]) { continue } - matches = [3]absoluteMatch{ + matches = [3]AbsoluteMatch{ currentMatch, matches[0], matches[1], } - if matches[2] == (absoluteMatch{}) { + if matches[2] == (AbsoluteMatch{}) { continue } @@ -171,34 +181,34 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { switch { case matches[0].Start < matches[2].End: // The first and third matches overlap; discard the one in between. - matches = [3]absoluteMatch{ + matches = [3]AbsoluteMatch{ matches[0], matches[2], - absoluteMatch{}, + AbsoluteMatch{}, } case matches[0].Start < matches[2].End+q.MinLength: // The first and third matches don't overlap, but there's no room for // another match between them. Emit the first match and discard the second. e.emit(matches[2]) - matches = [3]absoluteMatch{ + matches = [3]AbsoluteMatch{ matches[0], - absoluteMatch{}, - absoluteMatch{}, + AbsoluteMatch{}, + AbsoluteMatch{}, } default: // Emit the first match, shortening it if necessary to avoid overlap with the second. e.trim(matches[2], matches[1].Start, q.MinLength) - matches[2] = absoluteMatch{} + matches[2] = AbsoluteMatch{} } } // We've found all the matches now; emit the remaining ones. - if matches[1] != (absoluteMatch{}) { + if matches[1] != (AbsoluteMatch{}) { e.trim(matches[1], matches[0].Start, q.MinLength) } - if matches[0] != (absoluteMatch{}) { + if matches[0] != (AbsoluteMatch{}) { e.emit(matches[0]) } @@ -255,13 +265,13 @@ func extendMatch(src []byte, i, j int) int { // Given a 4-byte match at src[start] and src[candidate], extendMatch2 extends it // upward as far as possible, and downward no farther than to min. -func extendMatch2(src []byte, start, candidate, min int) absoluteMatch { +func extendMatch2(src []byte, start, candidate, min int) AbsoluteMatch { end := extendMatch(src, candidate+4, start+4) for start > min && candidate > 0 && src[start-1] == src[candidate-1] { start-- candidate-- } - return absoluteMatch{ + return AbsoluteMatch{ Start: start, End: end, Match: candidate,