rgmz · rgmz · Mar 10, 2024
diff --git a/hack/snifftest/main.go b/hack/snifftest/main.go
@@ -122,7 +122,7 @@ func main() {
 				for chunk := range chunksChan {
 					for name, scanner := range selectedScanners {
 						for _, dec := range allDecoders {
-							decoded := dec.FromChunk(&sources.Chunk{Data: chunk.Data})
+							decoded := dec.FromChunk(ctx, &sources.Chunk{Data: chunk.Data})
 							if decoded != nil {
 								foundKeyword := false
 								for _, kw := range scanner.Keywords() {

diff --git a/pkg/decoders/base64.go b/pkg/decoders/base64.go
@@ -5,6 +5,7 @@ import (
 	"encoding/base64"
 	"unicode"
 
+	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
@@ -31,7 +32,7 @@ func (d *Base64) Type() detectorspb.DecoderType {
 	return detectorspb.DecoderType_BASE64
 }
 
-func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
+func (d *Base64) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
 	decodableChunk := &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
 	encodedSubstrings := getSubstringsOfCharacterSet(chunk.Data, 20, b64CharsetMapping, b64EndChars)
 	decodedSubstrings := make(map[string][]byte)

diff --git a/pkg/decoders/base64_test.go b/pkg/decoders/base64_test.go
@@ -5,6 +5,7 @@ import (
 
 	"github.com/kylelemons/godebug/pretty"
 
+	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
@@ -134,7 +135,7 @@ func TestBase64_FromChunk(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			d := &Base64{}
-			got := d.FromChunk(tt.chunk)
+			got := d.FromChunk(context.Background(), tt.chunk)
 			if tt.want != nil {
 				if got == nil {
 					t.Fatal("got nil, did not want nil")
@@ -156,7 +157,7 @@ func BenchmarkFromChunkSmall(b *testing.B) {
 	data := detectors.MustGetBenchmarkData()["small"]
 
 	for n := 0; n < b.N; n++ {
-		d.FromChunk(&sources.Chunk{Data: data})
+		d.FromChunk(context.Background(), &sources.Chunk{Data: data})
 	}
 }
 
@@ -165,7 +166,7 @@ func BenchmarkFromChunkMedium(b *testing.B) {
 	data := detectors.MustGetBenchmarkData()["medium"]
 
 	for n := 0; n < b.N; n++ {
-		d.FromChunk(&sources.Chunk{Data: data})
+		d.FromChunk(context.Background(), &sources.Chunk{Data: data})
 	}
 }
 
@@ -174,6 +175,6 @@ func BenchmarkFromChunkLarge(b *testing.B) {
 	data := detectors.MustGetBenchmarkData()["big"]
 
 	for n := 0; n < b.N; n++ {
-		d.FromChunk(&sources.Chunk{Data: data})
+		d.FromChunk(context.Background(), &sources.Chunk{Data: data})
 	}
 }
diff --git a/pkg/decoders/decoders.go b/pkg/decoders/decoders.go
@@ -1,6 +1,7 @@
 package decoders
 
 import (
+	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
@@ -12,6 +13,7 @@ func DefaultDecoders() []Decoder {
 		&Base64{},
 		&UTF16{},
 		&EscapedUnicode{},
+		&HtmlEntity{},
 	}
 }
 
@@ -23,21 +25,22 @@ type DecodableChunk struct {
 }
 
 type Decoder interface {
-	FromChunk(chunk *sources.Chunk) *DecodableChunk
+	FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk
 	Type() detectorspb.DecoderType
 }
 
 // Fuzz is an entrypoint for go-fuzz, which is an AFL-style fuzzing tool.
 // This one attempts to uncover any panics during decoding.
 func Fuzz(data []byte) int {
 	decoded := false
+	ctx := context.Background()
 	for i, decoder := range DefaultDecoders() {
 		// Skip the first decoder (plain), because it will always decode and give
 		// priority to the input (return 1).
 		if i == 0 {
 			continue
 		}
-		chunk := decoder.FromChunk(&sources.Chunk{Data: data})
+		chunk := decoder.FromChunk(ctx, &sources.Chunk{Data: data})
 		if chunk != nil {
 			decoded = true
 		}

diff --git a/pkg/decoders/escaped_unicode.go b/pkg/decoders/escaped_unicode.go
@@ -6,6 +6,7 @@ import (
 	"strconv"
 	"unicode/utf8"
 
+	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
@@ -18,7 +19,7 @@ var _ Decoder = (*EscapedUnicode)(nil)
 // https://dencode.com/en/string/unicode-escape
 var (
 	// Standard Unicode notation.
-	//https://unicode.org/standard/principles.html
+	// https://unicode.org/standard/principles.html
 	codePointPat = regexp.MustCompile(`\bU\+([a-fA-F0-9]{4}).?`)
 
 	// Common escape sequence used in programming languages.
@@ -29,7 +30,7 @@ func (d *EscapedUnicode) Type() detectorspb.DecoderType {
 	return detectorspb.DecoderType_ESCAPED_UNICODE
 }
 
-func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
+func (d *EscapedUnicode) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
 	if chunk == nil || len(chunk.Data) == 0 {
 		return nil
 	}

diff --git a/pkg/decoders/escaped_unicode_test.go b/pkg/decoders/escaped_unicode_test.go
@@ -5,6 +5,7 @@ import (
 
 	"github.com/kylelemons/godebug/pretty"
 
+	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
 
@@ -68,7 +69,7 @@ func TestUnicodeEscape_FromChunk(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			d := &EscapedUnicode{}
-			got := d.FromChunk(tt.chunk)
+			got := d.FromChunk(context.Background(), tt.chunk)
 			if tt.want != nil {
 				if got == nil {
 					t.Fatal("got nil, did not want nil")

diff --git a/pkg/decoders/html_entity.go b/pkg/decoders/html_entity.go
@@ -0,0 +1,219 @@
+package decoders
+
+import (
+	"bytes"
+	"errors"
+	"regexp"
+	"strconv"
+	"strings"
+	"sync"
+
+	ahocorasick "github.com/BobuSumisu/aho-corasick"
+	"github.com/go-logr/logr"
+	"golang.org/x/exp/maps"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
+)
+
+// HtmlEntity decodes HTML character references written in decimal (`&#65;`), hexadecimal (`&#x41;`), or named (`&amp;`) form.
+// https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html
+type HtmlEntity struct{}
+
+var (
+	_ Decoder = (*HtmlEntity)(nil)
+
+	once     sync.Once
+	htmlTrie *ahocorasick.Trie
+)
+
+func init() {
+	// Use Aho-Corasick to pre-filter potential matches.
+	once.Do(func() {
+		keywords := map[string]struct{}{
+			`&#`:  {}, // decimal
+			`&#x`: {}, // hex
+		}
+		for entity := range namedEntityMap {
+			keywords[strings.ToLower(entity)] = struct{}{}
+		}
+		htmlTrie = ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(keywords)).Build()
+	})
+}
+
+func (d *HtmlEntity) Type() detectorspb.DecoderType {
+	return detectorspb.DecoderType_HTML
+}
+
+func (d *HtmlEntity) FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk {
+	if chunk == nil || len(chunk.Data) == 0 {
+		return nil
+	} else if m := htmlTrie.MatchFirst(chunk.Data); m == nil {
+		return nil
+	}
+
+	var (
+		logger = ctx.Logger().WithName("decoders.html")
+		// Decode a private copy so that chunk.Data itself is never modified; this avoids data races.
+		chunkData = bytes.Clone(chunk.Data)
+		matched   = false
+	)
+	if namedEntityPat.Match(chunkData) {
+		matched = true
+		chunkData = decodeNamedEntities(logger, chunkData)
+	}
+	if decimalEntityPat.Match(chunkData) {
+		matched = true
+		chunkData = decodeHtmlDecimal(logger, chunkData)
+	}
+	if hexEntityPat.Match(chunkData) {
+		matched = true
+		chunkData = decodeHtmlHex(logger, chunkData)
+	}
+
+	if matched {
+		return &DecodableChunk{
+			DecoderType: d.Type(),
+			Chunk: &sources.Chunk{
+				Data:           chunkData,
+				SourceName:     chunk.SourceName,
+				SourceID:       chunk.SourceID,
+				JobID:          chunk.JobID,
+				SecretID:       chunk.SecretID,
+				SourceMetadata: chunk.SourceMetadata,
+				SourceType:     chunk.SourceType,
+				Verify:         chunk.Verify,
+			},
+		}
+	} else {
+		return nil
+	}
+}
+
+// `A` = `&#65;`
+var decimalEntityPat = regexp.MustCompile(`&#(\d{1,3});`)
+
+func decodeHtmlDecimal(logger logr.Logger, input []byte) []byte {
+	decoded := make([]byte, 0, len(input))
+	lastIndex := 0
+
+	for _, match := range decimalEntityPat.FindAllSubmatchIndex(input, -1) {
+		startIndex := match[0]
+		endIndex := match[1]
+		decStartIndex := match[2]
+		decEndIndex := match[3]
+
+		// Copy the part of the input until the start of the entity
+		decoded = append(decoded, input[lastIndex:startIndex]...)
+
+		num, err := strconv.Atoi(string(input[decStartIndex:decEndIndex]))
+		if err != nil {
+			continue
+		}
+
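+		// Append the decoded byte. NOTE(review): both `continue` paths skip `lastIndex = endIndex`, so the prefix copied above is appended again later (reachable for 256-999).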
+		if num < 0 || num > 255 {
+			logger.Error(errors.New("invalid decimal byte"), "Unable to decode HTML entity", "match", input[decStartIndex:decEndIndex], "byte", num)
+			continue
+		}
+		decoded = append(decoded, byte(num))
@@ -110,4 +110,7 @@
-		// Append the decoded byte
-		decoded = append(decoded, byte(num))
+		// Check if the parsed number is within the valid range for a byte
+		if num >= 0 && num <= 255 {
+			// Append the decoded byte
+			decoded = append(decoded, byte(num))
+		}
@@ -110,4 +110,7 @@

-		// Append the decoded byte
-		decoded = append(decoded, byte(num))
+		// Check if the parsed number is within the valid range for a byte
+		if num >= 0 && num <= 255 {
+			// Append the decoded byte
+			decoded = append(decoded, byte(num))
+		}

+		lastIndex = endIndex
+	}
+
+	// Append the remaining part of the input
+	decoded = append(decoded, input[lastIndex:]...)
+
+	return decoded
+}
+
+// `A` = `&#x41;`
+var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,2});`)
+
+func decodeHtmlHex(logger logr.Logger, input []byte) []byte {
+	decoded := make([]byte, 0, len(input))
+	lastIndex := 0
+
+	for _, match := range hexEntityPat.FindAllSubmatchIndex(input, -1) {
+		startIndex := match[0]
+		endIndex := match[1]
+		hexStartIndex := match[2]
+		hexEndIndex := match[3]
+
+		// Copy the part of the input until the start of the entity
+		decoded = append(decoded, input[lastIndex:startIndex]...)
+
+		// Parse the hexadecimal value to an integer
+		num, err := strconv.ParseInt(string(input[hexStartIndex:hexEndIndex]), 16, 32)
+		if err != nil {
+			continue
+		}
+
+		// Append the decoded byte (the pattern allows at most two hex digits, so the range check below is purely defensive).
+		if num < 0 || num > 255 {
+			logger.Error(errors.New("invalid hex byte"), "Unable to decode HTML entity", "match", input[hexStartIndex:hexEndIndex], "byte", num)
+			continue
+		}
+		decoded = append(decoded, byte(num))
@@ -144,2 +144,7 @@
+		// Check if the parsed number is within the valid range for a byte
+		if num < 0 || num > 255 {
+			continue
+		}
+
 		// Append the decoded byte
@@ -144,2 +144,7 @@

+		// Check if the parsed number is within the valid range for a byte
+		if num < 0 || num > 255 {
+			continue
+		}
+
 		// Append the decoded byte
+
+		lastIndex = endIndex
+	}
+
+	// Append the remaining part of the input
+	decoded = append(decoded, input[lastIndex:]...)
+
+	return decoded
+}
+
+var (
+	// https://www.compart.com/en/unicode/html
+	namedEntityMap = map[string][]byte{
+		"&tab;":              []byte("	"),
+		"&newline;":          []byte("\n"),
+		"&excl;":             []byte("!"),
+		"&quot;":             []byte(`"`),
+		"&num;":              []byte("#"),
+		"&dollar;":           []byte("$"),
+		"&percnt;":           []byte("%"),
+		"&amp;":              []byte("&"),
+		"&apos;":             []byte("'"),
+		"&lpar;":             []byte("("),
+		"&rpar;":             []byte(")"),
+		"&ast;":              []byte("*"),
+		"&plus;":             []byte("+"),
+		"&comma;":            []byte(","),
+		"&period;":           []byte("."),
+		"&sol;":              []byte("/"),
+		"&colon;":            []byte(":"),
+		"&semi;":             []byte(";"),
+		"&lt;":               []byte("<"),
+		"&equals;":           []byte("="),
+		"&gt;":               []byte(">"),
+		"&quest;":            []byte("?"),
+		"&commat;":           []byte("@"),
+		"&lsqb;":             []byte("["),
+		"&bsol;":             []byte("\\"),
+		"&rsqb;":             []byte("]"),
+		"&hat;":              []byte("^"),
+		"&underbar;":         []byte("_"),
+		"&diacriticalgrave;": []byte("`"),
+		"&lcub;":             []byte("{"),
+		"&verticalline;":     []byte("|"),
+		"&rcub;":             []byte("}"),
+		"&nonbreakingspace;": []byte(" "),
+	}
+	namedEntityPat = func() *regexp.Regexp {
+		return regexp.MustCompile(
+			"(?i)(" + strings.Join(maps.Keys(namedEntityMap), "|") + ")")
+	}()
+)
+
+func decodeNamedEntities(_ logr.Logger, input []byte) []byte {
+	return namedEntityPat.ReplaceAllFunc(input, func(match []byte) []byte {
+		m := strings.ToLower(string(match))
+		if replacement, ok := namedEntityMap[m]; ok {
+			return replacement
+		}
+		return match
+	})
+}