Skip to content

Commit

Permalink
Consolidate all hashing to the common/hashing package
Browse files Browse the repository at this point in the history
And remove the now unused hashing funcs.
  • Loading branch information
bep committed Jul 31, 2024
1 parent d5eda13 commit e67886c
Show file tree
Hide file tree
Showing 125 changed files with 177 additions and 368 deletions.
80 changes: 80 additions & 0 deletions common/hashing/hashing.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@
package hashing

import (
"crypto/md5"
"encoding/hex"
"io"
"strconv"
"sync"

"github.com/cespare/xxhash/v2"
"github.com/gohugoio/hashstructure"
"github.com/gohugoio/hugo/identity"
)

// XXHashFromReader calculates the xxHash for the given reader.
Expand Down Expand Up @@ -50,6 +54,82 @@ func XxHashFromStringHexEncoded(f string) string {
return hex.EncodeToString(hash)
}

// MD5FromStringHexEncoded computes the MD5 checksum of f and returns it
// as a lowercase hexadecimal string.
func MD5FromStringHexEncoded(f string) string {
	sum := md5.Sum([]byte(f))
	return hex.EncodeToString(sum[:])
}

// HashString returns the hash of the given elements as a base-10 string.
// It is the string form of HashUint64 and panics if the hash cannot be
// calculated.
// Note that this hash should be used primarily for identity, not for change
// detection: for the more complex values (e.g. Page) it will not hash the
// full content.
func HashString(vs ...any) string {
	return strconv.FormatUint(HashUint64(vs...), 10)
}

// hashOptsPool is a pool of *hashstructure.HashOptions values. Each newly
// created entry gets its own xxhash hasher.
var hashOptsPool = sync.Pool{
New: func() any {
return &hashstructure.HashOptions{
Hasher: xxhash.New(),
}
},
}

// getHashOpts fetches a *hashstructure.HashOptions from hashOptsPool.
func getHashOpts() *hashstructure.HashOptions {
return hashOptsPool.Get().(*hashstructure.HashOptions)
}

// putHashOpts resets the hasher held by opts and hands opts back to
// hashOptsPool.
func putHashOpts(opts *hashstructure.HashOptions) {
// Reset before pooling so state from this use is not carried into the next.
opts.Hasher.Reset()
hashOptsPool.Put(opts)
}

// HashUint64 returns a 64-bit hash of the given elements.
// A single element is hashed directly; any other number of elements is
// hashed as one slice. Each element is first passed through toHashable.
// It panics if the hash cannot be calculated.
// Note that this hash should be used primarily for identity, not for change
// detection: for the more complex values (e.g. Page) it will not hash the
// full content.
func HashUint64(vs ...any) uint64 {
	var target any
	switch len(vs) {
	case 1:
		target = toHashable(vs[0])
	default:
		hashables := make([]any, 0, len(vs))
		for _, v := range vs {
			hashables = append(hashables, toHashable(v))
		}
		target = hashables
	}

	// Borrow pooled options; they are reset and returned when we are done.
	opts := getHashOpts()
	defer putHashOpts(opts)

	sum, err := hashstructure.Hash(target, opts)
	if err != nil {
		panic(err)
	}
	return sum
}

// keyer is implemented by values that expose a string key; toHashable
// substitutes that key for the value itself.
type keyer interface {
Key() string
}

// toHashable maps v to the value that should actually be hashed.
// For structs, hashstructure.Hash only works on the exported fields, so
// known identity types are replaced by their key or identity. A keyer takes
// precedence over an identity.IdentityProvider; anything else is returned
// unchanged.
func toHashable(v any) any {
	if k, ok := v.(keyer); ok {
		return k.Key()
	}
	if p, ok := v.(identity.IdentityProvider); ok {
		return p.GetIdentity()
	}
	return v
}

type xxhashReadFrom struct {
buff []byte
*xxhash.Digest
Expand Down
50 changes: 45 additions & 5 deletions common/hashing/hashing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
package hashing

import (
"fmt"
"math"
"strings"
"testing"

"github.com/cespare/xxhash/v2"
qt "github.com/frankban/quicktest"
)

Expand Down Expand Up @@ -72,8 +73,47 @@ func BenchmarkXXHashFromStringHexEncoded(b *testing.B) {
}
}

func xxHashFromString(f string) uint64 {
h := xxhash.New()
h.WriteString(f)
return h.Sum64()
// TestHashString pins the HashString output for a few fixed inputs.
func TestHashString(t *testing.T) {
	c := qt.New(t)

	// Two separate arguments and their concatenation hash differently.
	c.Assert(HashString("a", "b"), qt.Equals, "3176555414984061461")
	c.Assert(HashString("ab"), qt.Equals, "7347350983217793633")

	vals := []any{"a", "b", tstKeyer{"c"}}

	c.Assert(HashString(vals...), qt.Equals, "4438730547989914315")
	// The caller's slice must still hold the original value afterwards.
	c.Assert(vals[2], qt.Equals, tstKeyer{"c"})
}

// tstKeyer is a test value with a single unexported field, exposed
// through its Key method.
type tstKeyer struct {
key string
}

// Key returns the key field.
func (t tstKeyer) Key() string {
return t.key
}

// String returns the key field prefixed with "key: ".
func (t tstKeyer) String() string {
return "key: " + t.key
}

// BenchmarkHashString measures HashString on strings of increasing length.
// Each sub-benchmark is named after the byte length of its input.
func BenchmarkHashString(b *testing.B) {
	const word = " hello "

	// Build inputs of 4^1 .. 4^5 repetitions of word.
	var inputs []string
	for exp := 1; exp <= 5; exp++ {
		repeats := int(math.Pow(4, float64(exp)))
		inputs = append(inputs, strings.Repeat(word, repeats))
	}

	b.ResetTimer()

	for _, input := range inputs {
		b.Run(fmt.Sprintf("n%d", len(input)), func(b *testing.B) {
			for i := 0; i < b.N; i++ {
				HashString(input)
			}
		})
	}
}
4 changes: 2 additions & 2 deletions common/loggers/handlersmisc.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import (
"sync"

"github.com/bep/logg"
"github.com/gohugoio/hugo/identity"
"github.com/gohugoio/hugo/common/hashing"
)

// PanicOnWarningHook panics on warnings.
Expand Down Expand Up @@ -85,7 +85,7 @@ func (h *logOnceHandler) HandleLog(e *logg.Entry) error {
}
h.mu.Lock()
defer h.mu.Unlock()
hash := identity.HashUint64(e.Level, e.Message, e.Fields)
hash := hashing.HashUint64(e.Level, e.Message, e.Fields)
if h.seen[hash] {
return errStop
}
Expand Down
4 changes: 2 additions & 2 deletions config/namespace.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ package config
import (
"encoding/json"

"github.com/gohugoio/hugo/identity"
"github.com/gohugoio/hugo/common/hashing"
)

func DecodeNamespace[S, C any](configSource any, buildConfig func(any) (C, any, error)) (*ConfigNamespace[S, C], error) {
// Calculate the hash of the input (not including any defaults applied later).
// This allows us to introduce new config options without breaking the hash.
h := identity.HashString(configSource)
h := hashing.HashString(configSource)

// Build the config
c, ext, err := buildConfig(configSource)
Expand Down
62 changes: 0 additions & 62 deletions helpers/general.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ package helpers

import (
"bytes"
"crypto/md5"
"encoding/hex"
"fmt"
"io"
"net"
Expand Down Expand Up @@ -257,66 +255,6 @@ func SliceToLower(s []string) []string {
return l
}

// XXHashFromReader creates a xxHash hash from the given reader.

// MD5String returns the MD5 checksum of f as a lowercase hexadecimal string.
func MD5String(f string) string {
	sum := md5.Sum([]byte(f))
	return hex.EncodeToString(sum[:])
}

// MD5FromReaderFast creates an MD5 hash from a sample of r. It only reads
// parts of the input for speed, so don't use it if the inputs are very
// subtly different. It will not close r.
//
// Up to maxChunks buffers of peekSize bytes are hashed. The first is read
// from the reader's current position; before every later read the reader is
// seeked to the fixed absolute offset seek. When a read comes up short, the
// whole buffer (including whatever it held before that read) is hashed and
// sampling stops.
//
// It returns the hex-encoded MD5 hash and the size of r in bytes, the latter
// obtained by seeking to the end of r.
func MD5FromReaderFast(r io.ReadSeeker) (string, int64, error) {
	const (
		// Do not change once set in stone!
		maxChunks = 8
		peekSize  = 64
		seek      = 2048
	)

	digest := md5.New()
	chunk := make([]byte, peekSize)

sampling:
	for n := 0; n < maxChunks; n++ {
		if n > 0 {
			if _, err := r.Seek(seek, io.SeekStart); err != nil {
				if err == io.EOF {
					break sampling
				}
				return "", 0, err
			}
		}

		_, err := io.ReadAtLeast(r, chunk, peekSize)
		switch err {
		case nil:
			digest.Write(chunk)
		case io.EOF, io.ErrUnexpectedEOF:
			// Short read: the full buffer is still hashed, then we stop.
			digest.Write(chunk)
			break sampling
		default:
			return "", 0, err
		}
	}

	// A failure to seek to the end is ignored; size is whatever Seek reported.
	size, _ := r.Seek(0, io.SeekEnd)

	return hex.EncodeToString(digest.Sum(nil)), size, nil
}

// MD5FromReader creates an MD5 hash from everything that can be read from r
// and returns it as a lowercase hexadecimal string.
// If reading from r fails, it returns an empty string and the read error.
func MD5FromReader(r io.Reader) (string, error) {
	h := md5.New()
	if _, err := io.Copy(h, r); err != nil {
		// Propagate the failure instead of reporting an empty hash as success.
		return "", err
	}
	return hex.EncodeToString(h.Sum(nil)), nil
}

// IsWhitespace determines if the given rune is whitespace.
func IsWhitespace(r rune) bool {
return r == ' ' || r == '\t' || r == '\n' || r == '\r'
Expand Down
89 changes: 0 additions & 89 deletions helpers/general_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,13 @@
package helpers_test

import (
"fmt"
"reflect"
"strings"
"testing"

"github.com/gohugoio/hugo/helpers"

qt "github.com/frankban/quicktest"
"github.com/spf13/afero"
)

func TestResolveMarkup(t *testing.T) {
Expand Down Expand Up @@ -256,93 +254,6 @@ func TestUniqueStringsSorted(t *testing.T) {
c.Assert(helpers.UniqueStringsSorted(nil), qt.IsNil)
}

// TestFastMD5FromFile checks that MD5FromReaderFast yields a known hash for
// a tiny file and different hashes for files with different content, and
// that MD5FromReader on an already-consumed file differs from the fast hash.
func TestFastMD5FromFile(t *testing.T) {
	fs := afero.NewMemMapFs()

	fixtures := []struct{ name, content string }{
		{"small.txt", "abc"},
		{"small2.txt", "abd"},
		{"bigger.txt", strings.Repeat("a bc d e", 100)},
		{"bigger2.txt", strings.Repeat("c d e f g", 100)},
	}
	for _, fx := range fixtures {
		if err := afero.WriteFile(fs, fx.name, []byte(fx.content), 0o777); err != nil {
			t.Fatal(err)
		}
	}

	c := qt.New(t)

	mustOpen := func(name string) afero.File {
		f, err := fs.Open(name)
		c.Assert(err, qt.IsNil)
		return f
	}

	small1 := mustOpen("small.txt")
	small2 := mustOpen("small2.txt")
	big1 := mustOpen("bigger.txt")
	big2 := mustOpen("bigger2.txt")

	defer small1.Close()
	defer small2.Close()
	defer big1.Close()
	defer big2.Close()

	fastSum := func(f afero.File) string {
		sum, _, err := helpers.MD5FromReaderFast(f)
		c.Assert(err, qt.IsNil)
		return sum
	}

	m1 := fastSum(small1)
	c.Assert(m1, qt.Equals, "e9c8989b64b71a88b4efb66ad05eea96")

	m2 := fastSum(small2)
	c.Assert(m2, qt.Not(qt.Equals), m1)

	m3 := fastSum(big1)
	c.Assert(m3, qt.Not(qt.Equals), m2)

	m4 := fastSum(big2)
	c.Assert(m4, qt.Not(qt.Equals), m3)

	m5, err := helpers.MD5FromReader(big2)
	c.Assert(err, qt.IsNil)
	c.Assert(m5, qt.Not(qt.Equals), m4)
}

// BenchmarkMD5FromFileFast compares helpers.MD5FromReader (full=true) with
// helpers.MD5FromReaderFast (full=false) on a 20000-byte in-memory file.
func BenchmarkMD5FromFileFast(b *testing.B) {
fs := afero.NewMemMapFs()

for _, full := range []bool{false, true} {
b.Run(fmt.Sprintf("full=%t", full), func(b *testing.B) {
for i := 0; i < b.N; i++ {
// Writing and opening the file is setup; keep it out of the timing.
b.StopTimer()
if err := afero.WriteFile(fs, "file.txt", []byte(strings.Repeat("1234567890", 2000)), 0o777); err != nil {
b.Fatal(err)
}
f, err := fs.Open("file.txt")
if err != nil {
b.Fatal(err)
}
b.StartTimer()
if full {
if _, err := helpers.MD5FromReader(f); err != nil {
b.Fatal(err)
}
} else {
if _, _, err := helpers.MD5FromReaderFast(f); err != nil {
b.Fatal(err)
}
}
// Closed once per iteration, while the timer is running.
f.Close()
}
})
}
}

func BenchmarkUniqueStrings(b *testing.B) {
input := []string{"a", "b", "d", "e", "d", "h", "a", "i"}

Expand Down
Loading

0 comments on commit e67886c

Please sign in to comment.