Skip to content

Commit

Permalink
Replace the MD5 hashing of images with xxHash
Browse files Browse the repository at this point in the history
Note that we only use this for change detection.

The previous implementation invoked `MD5FromReaderFast`, which created an MD5 hash from eight 64-byte chunks of the file and was therefore obviously very fast. The new implementation creates the hash from the entire file and ... seems to be even more effective:

```
name          old time/op    new time/op    delta
HashImage-10    9.45µs ±21%   10.89µs ± 1%     ~     (p=0.343 n=4+4)

name          old alloc/op   new alloc/op   delta
HashImage-10      144B ± 0%        8B ± 0%  -94.44%  (p=0.029 n=4+4)

name          old allocs/op  new allocs/op  delta
HashImage-10      4.00 ± 0%      1.00 ± 0%  -75.00%  (p=0.029 n=4+4)
```
  • Loading branch information
bep committed Jul 31, 2024
1 parent 8b5d796 commit d5eda13
Show file tree
Hide file tree
Showing 7 changed files with 180 additions and 29 deletions.
86 changes: 86 additions & 0 deletions common/hashing/hashing.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2024 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package hashing provides common hashing utilities.
package hashing

import (
"encoding/hex"
"io"
"sync"

"github.com/cespare/xxhash/v2"
)

// XXHashFromReader streams r into a pooled xxHash digest and returns the
// resulting 64-bit hash together with the byte count reported by io.Copy.
//
// On a copy error it returns zero values and that error. The pooled hasher
// is handed back (and reset) via putXxHashReadFrom when the function returns.
func XXHashFromReader(r io.ReadSeeker) (uint64, int64, error) {
	hasher := getXxHashReadFrom()
	defer putXxHashReadFrom(hasher)

	copied, copyErr := io.Copy(hasher, r)
	if copyErr != nil {
		return 0, 0, copyErr
	}

	sum := hasher.Sum64()
	return sum, copied, nil
}

// XXHashFromString calculates the 64-bit xxHash (zero seed) for the given
// string.
//
// The returned error is always nil; it is kept so the signature stays
// compatible with existing callers.
func XXHashFromString(s string) (uint64, error) {
	// Sum64String is the one-shot form of New + WriteString + Sum64 and
	// avoids allocating a streaming Digest for a single input.
	return xxhash.Sum64String(s), nil
}

// XxHashFromStringHexEncoded hashes f with xxHash and returns the digest
// bytes produced by Sum(nil) as a lowercase hex string.
func XxHashFromStringHexEncoded(f string) string {
	digest := xxhash.New()
	digest.WriteString(f)
	return hex.EncodeToString(digest.Sum(nil))
}

// xxhashReadFrom pairs an xxHash digest with a reusable read buffer so that
// it can be filled directly from an io.Reader through its ReadFrom method.
// Instances are recycled through xXhashReadFromPool.
type xxhashReadFrom struct {
	// buff is the scratch buffer ReadFrom reads into before feeding the digest.
	buff []byte
	// Digest is the embedded xxHash state; its methods are promoted.
	*xxhash.Digest
}

// ReadFrom implements io.ReaderFrom. It reads r in chunks into x.buff and
// feeds every chunk to the embedded digest until r reports an error.
//
// It returns the total number of bytes read from r. io.EOF marks the normal
// end of the input and is reported as a nil error; any other error is
// returned as-is together with the number of bytes consumed so far.
func (x *xxhashReadFrom) ReadFrom(r io.Reader) (int64, error) {
	// Accumulate across iterations: the final Read usually returns 0 bytes
	// alongside io.EOF, so returning only the last count would under-report.
	var total int64
	for {
		n, err := r.Read(x.buff)
		if n > 0 {
			total += int64(n)
			x.Digest.Write(x.buff[:n])
		}
		if err != nil {
			if err == io.EOF {
				err = nil
			}
			return total, err
		}
	}
}

// xXhashReadFromPool recycles xxhashReadFrom values. Each new entry gets a
// fresh xxHash digest and a 48 KiB read buffer.
var xXhashReadFromPool = sync.Pool{
	New: func() any {
		entry := &xxhashReadFrom{
			Digest: xxhash.New(),
			buff:   make([]byte, 48*1024),
		}
		return entry
	},
}

// getXxHashReadFrom takes a hasher from xXhashReadFromPool.
func getXxHashReadFrom() *xxhashReadFrom {
	pooled := xXhashReadFromPool.Get()
	return pooled.(*xxhashReadFrom)
}

// putXxHashReadFrom resets the digest state of h and returns h to
// xXhashReadFromPool for reuse.
func putXxHashReadFrom(h *xxhashReadFrom) {
	h.Digest.Reset()
	xXhashReadFromPool.Put(h)
}
79 changes: 79 additions & 0 deletions common/hashing/hashing_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Copyright 2024 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hashing

import (
"strings"
"testing"

"github.com/cespare/xxhash/v2"
qt "github.com/frankban/quicktest"
)

// TestXxHashFromReader checks that XXHashFromReader reports the expected
// digest and input length for a small in-memory reader.
func TestXxHashFromReader(t *testing.T) {
	c := qt.New(t)

	const input = "Hello World"
	got, size, err := XXHashFromReader(strings.NewReader(input))

	c.Assert(err, qt.IsNil)
	c.Assert(size, qt.Equals, int64(len(input)))
	c.Assert(got, qt.Equals, uint64(7148569436472236994))
}

// TestXxHashFromString checks XXHashFromString against a known digest.
func TestXxHashFromString(t *testing.T) {
	c := qt.New(t)

	got, err := XXHashFromString("Hello World")

	c.Assert(err, qt.IsNil)
	c.Assert(got, qt.Equals, uint64(7148569436472236994))
}

// TestXxHashFromStringHexEncoded checks the hex-encoded digest against an
// externally computed reference value.
func TestXxHashFromStringHexEncoded(t *testing.T) {
	c := qt.New(t)

	const input = "The quick brown fox jumps over the lazy dog"
	// Reference value: https://asecuritysite.com/encryption/xxhash?val=The%20quick%20brown%20fox%20jumps%20over%20the%20lazy%20dog
	c.Assert(XxHashFromStringHexEncoded(input), qt.Equals, "0b242d361fda71bc")
}

// BenchmarkXXHashFromReader measures hashing a short string through the
// reader-based entry point.
func BenchmarkXXHashFromReader(b *testing.B) {
	src := strings.NewReader("Hello World")
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		XXHashFromReader(src)
		// Rewind so the next iteration hashes the full input again.
		src.Seek(0, 0)
	}
}

// BenchmarkXXHashFromString measures hashing a short string directly.
func BenchmarkXXHashFromString(b *testing.B) {
	const input = "Hello World"
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		XXHashFromString(input)
	}
}

// BenchmarkXXHashFromStringHexEncoded measures hashing a sentence and
// hex-encoding the digest.
func BenchmarkXXHashFromStringHexEncoded(b *testing.B) {
	const input = "The quick brown fox jumps over the lazy dog"
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		XxHashFromStringHexEncoded(input)
	}
}

// xxHashFromString returns the 64-bit xxHash of f computed with a fresh
// digest.
func xxHashFromString(f string) uint64 {
	digest := xxhash.New()
	digest.WriteString(f)
	sum := digest.Sum64()
	return sum
}
13 changes: 3 additions & 10 deletions helpers/general.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,11 @@ import (
"unicode"
"unicode/utf8"

"github.com/cespare/xxhash/v2"
bp "github.com/gohugoio/hugo/bufferpool"

"github.com/spf13/afero"

"github.com/jdkato/prose/transform"

bp "github.com/gohugoio/hugo/bufferpool"
)

// FilePathSeparator as defined by os.Separator.
Expand Down Expand Up @@ -258,13 +257,7 @@ func SliceToLower(s []string) []string {
return l
}

// XxHashString takes a string and returns its xxHash hash.
func XxHashString(f string) string {
h := xxhash.New()
h.WriteString(f)
hash := h.Sum(nil)
return hex.EncodeToString(hash)
}
// XXHashFromReader creates a xxHash hash from the given reader.

// MD5String takes a string and returns its MD5 hash.
func MD5String(f string) string {
Expand Down
2 changes: 1 addition & 1 deletion resources/image.go
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ func (i *imageResource) relTargetPathFromConfig(conf images.ImageConfig) interna
}

h := i.hash()
idStr := fmt.Sprintf("_hu%s_%d", h, i.size())
idStr := fmt.Sprintf("_hu%d_%d", h, i.size())

// Do not change for no good reason.
const md5Threshold = 100
Expand Down
13 changes: 7 additions & 6 deletions resources/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/gohugoio/hugo/identity"
"github.com/gohugoio/hugo/resources/internal"

"github.com/gohugoio/hugo/common/hashing"
"github.com/gohugoio/hugo/common/herrors"
"github.com/gohugoio/hugo/common/paths"

Expand Down Expand Up @@ -307,7 +308,7 @@ type fileInfo interface {
}

type hashProvider interface {
hash() string
hash() uint64
}

var _ resource.StaleInfo = (*StaleValue[any])(nil)
Expand Down Expand Up @@ -403,7 +404,7 @@ func (l *genericResource) size() int64 {
return l.h.size
}

func (l *genericResource) hash() string {
func (l *genericResource) hash() uint64 {
if err := l.h.init(l); err != nil {
panic(err)
}
Expand Down Expand Up @@ -628,15 +629,15 @@ type targetPather interface {
}

type resourceHash struct {
value string
value uint64
size int64
initOnce sync.Once
}

func (r *resourceHash) init(l hugio.ReadSeekCloserProvider) error {
var initErr error
r.initOnce.Do(func() {
var hash string
var hash uint64
var size int64
f, err := l.ReadSeekCloser()
if err != nil {
Expand All @@ -656,6 +657,6 @@ func (r *resourceHash) init(l hugio.ReadSeekCloserProvider) error {
return initErr
}

func hashImage(r io.ReadSeeker) (string, int64, error) {
return helpers.MD5FromReaderFast(r)
func hashImage(r io.ReadSeeker) (uint64, int64, error) {
return hashing.XXHashFromReader(r)
}
12 changes: 2 additions & 10 deletions tpl/hash/hash.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@ package hash

import (
"context"
"encoding/hex"
"hash/fnv"

"github.com/cespare/xxhash/v2"
"github.com/gohugoio/hugo/common/hashing"
"github.com/gohugoio/hugo/deps"
"github.com/gohugoio/hugo/tpl/internal"
"github.com/spf13/cast"
Expand Down Expand Up @@ -51,14 +50,7 @@ func (ns *Namespace) XxHash(v any) (string, error) {
return "", err
}

hasher := xxhash.New()

_, err = hasher.WriteString(conv)
if err != nil {
return "", err
}
hash := hasher.Sum(nil)
return hex.EncodeToString(hash), nil
return hashing.XxHashFromStringHexEncoded(conv), nil
}

const name = "hash"
Expand Down
4 changes: 2 additions & 2 deletions tpl/tplimpl/template_ast_transformers.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ import (
"fmt"
"strings"

"github.com/gohugoio/hugo/helpers"
htmltemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/htmltemplate"
texttemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/texttemplate"

"github.com/gohugoio/hugo/tpl/internal/go_templates/texttemplate/parse"

"github.com/gohugoio/hugo/common/hashing"
"github.com/gohugoio/hugo/common/maps"
"github.com/gohugoio/hugo/tpl"
"github.com/mitchellh/mapstructure"
Expand Down Expand Up @@ -254,7 +254,7 @@ func (c *templateContext) handleDefer(withNode *parse.WithNode) {
c.err = errors.New("resources.PostProcess cannot be used in a deferred template")
return
}
innerHash := helpers.XxHashString(s)
innerHash := hashing.XxHashFromStringHexEncoded(s)
deferredID := tpl.HugoDeferredTemplatePrefix + innerHash

c.deferNodes[deferredID] = inner
Expand Down

0 comments on commit d5eda13

Please sign in to comment.