Skip to content

Commit

Permalink
feat(decoders): HTML entities
Browse files Browse the repository at this point in the history
  • Loading branch information
rgmz committed Dec 31, 2024
1 parent dde8f8a commit ac02868
Show file tree
Hide file tree
Showing 15 changed files with 1,401 additions and 1,059 deletions.
2 changes: 1 addition & 1 deletion hack/snifftest/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func main() {
for chunk := range chunksChan {
for name, scanner := range selectedScanners {
for _, dec := range allDecoders {
decoded := dec.FromChunk(&sources.Chunk{Data: chunk.Data})
decoded := dec.FromChunk(ctx, &sources.Chunk{Data: chunk.Data})
if decoded != nil {
foundKeyword := false
for _, kw := range scanner.Keywords() {
Expand Down
3 changes: 2 additions & 1 deletion pkg/decoders/base64.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/base64"
"unicode"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand All @@ -31,7 +32,7 @@ func (d *Base64) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_BASE64
}

func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
func (d *Base64) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
decodableChunk := &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
encodedSubstrings := getSubstringsOfCharacterSet(chunk.Data, 20, b64CharsetMapping, b64EndChars)
decodedSubstrings := make(map[string][]byte)
Expand Down
9 changes: 5 additions & 4 deletions pkg/decoders/base64_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (

"github.com/kylelemons/godebug/pretty"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand Down Expand Up @@ -134,7 +135,7 @@ func TestBase64_FromChunk(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d := &Base64{}
got := d.FromChunk(tt.chunk)
got := d.FromChunk(context.Background(), tt.chunk)
if tt.want != nil {
if got == nil {
t.Fatal("got nil, did not want nil")
Expand All @@ -156,7 +157,7 @@ func BenchmarkFromChunkSmall(b *testing.B) {
data := detectors.MustGetBenchmarkData()["small"]

for n := 0; n < b.N; n++ {
d.FromChunk(&sources.Chunk{Data: data})
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
}
}

Expand All @@ -165,7 +166,7 @@ func BenchmarkFromChunkMedium(b *testing.B) {
data := detectors.MustGetBenchmarkData()["medium"]

for n := 0; n < b.N; n++ {
d.FromChunk(&sources.Chunk{Data: data})
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
}
}

Expand All @@ -174,6 +175,6 @@ func BenchmarkFromChunkLarge(b *testing.B) {
data := detectors.MustGetBenchmarkData()["big"]

for n := 0; n < b.N; n++ {
d.FromChunk(&sources.Chunk{Data: data})
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
}
}
7 changes: 5 additions & 2 deletions pkg/decoders/decoders.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package decoders

import (
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand All @@ -12,6 +13,7 @@ func DefaultDecoders() []Decoder {
&Base64{},
&UTF16{},
&EscapedUnicode{},
&HtmlEntity{},
}
}

Expand All @@ -23,21 +25,22 @@ type DecodableChunk struct {
}

type Decoder interface {
FromChunk(chunk *sources.Chunk) *DecodableChunk
FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk
Type() detectorspb.DecoderType
}

// Fuzz is an entrypoint for go-fuzz, which is an AFL-style fuzzing tool.
// This one attempts to uncover any panics during decoding.
func Fuzz(data []byte) int {
decoded := false
ctx := context.Background()
for i, decoder := range DefaultDecoders() {
// Skip the first decoder (plain), because it will always decode and give
// priority to the input (return 1).
if i == 0 {
continue
}
chunk := decoder.FromChunk(&sources.Chunk{Data: data})
chunk := decoder.FromChunk(ctx, &sources.Chunk{Data: data})
if chunk != nil {
decoded = true
}
Expand Down
5 changes: 3 additions & 2 deletions pkg/decoders/escaped_unicode.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strconv"
"unicode/utf8"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand All @@ -18,7 +19,7 @@ var _ Decoder = (*EscapedUnicode)(nil)
// https://dencode.com/en/string/unicode-escape
var (
// Standard Unicode notation.
//https://unicode.org/standard/principles.html
// https://unicode.org/standard/principles.html
codePointPat = regexp.MustCompile(`\bU\+([a-fA-F0-9]{4}).?`)

// Common escape sequence used in programming languages.
Expand All @@ -29,7 +30,7 @@ func (d *EscapedUnicode) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_ESCAPED_UNICODE
}

func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
func (d *EscapedUnicode) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
if chunk == nil || len(chunk.Data) == 0 {
return nil
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/decoders/escaped_unicode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (

"github.com/kylelemons/godebug/pretty"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

Expand Down Expand Up @@ -68,7 +69,7 @@ func TestUnicodeEscape_FromChunk(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d := &EscapedUnicode{}
got := d.FromChunk(tt.chunk)
got := d.FromChunk(context.Background(), tt.chunk)
if tt.want != nil {
if got == nil {
t.Fatal("got nil, did not want nil")
Expand Down
219 changes: 219 additions & 0 deletions pkg/decoders/html_entity.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
package decoders

import (
"bytes"
"errors"
"regexp"
"strconv"
"strings"
"sync"

ahocorasick "github.com/BobuSumisu/aho-corasick"
"github.com/go-logr/logr"
"golang.org/x/exp/maps"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// HtmlEntity decodes characters that are encoded as decimal, hexadecimal, or named entities.
// https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html
type HtmlEntity struct{}

var (
// Compile-time assertion that *HtmlEntity satisfies the Decoder interface.
_ Decoder = (*HtmlEntity)(nil)

// once guards the one-time construction of htmlTrie performed in init.
once sync.Once
// htmlTrie is the Aho-Corasick trie of entity keywords that FromChunk uses
// to reject chunks containing no candidate entity before running any regex.
htmlTrie *ahocorasick.Trie
)

// init builds htmlTrie, the Aho-Corasick pre-filter consulted by FromChunk.
// The keyword set is the numeric-entity prefixes plus the lowercased form of
// every key in namedEntityMap.
func init() {
	once.Do(func() {
		// A set is used so that keywords that collide after lowercasing are
		// only handed to the trie builder once.
		seen := make(map[string]struct{}, len(namedEntityMap)+2)
		seen[`&#`] = struct{}{}  // decimal
		seen[`&#x`] = struct{}{} // hex
		for name := range namedEntityMap {
			seen[strings.ToLower(name)] = struct{}{}
		}

		patterns := maps.Keys(seen)
		builder := ahocorasick.NewTrieBuilder()
		htmlTrie = builder.AddStrings(patterns).Build()
	})
}

// Type returns the decoder type that identifies this decoder, DecoderType_HTML.
func (d *HtmlEntity) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_HTML
}

// FromChunk decodes HTML entities found in chunk.Data.
//
// It returns nil when chunk is nil or empty, when the keyword trie finds no
// candidate entity, or when none of the entity patterns match. Otherwise it
// returns a DecodableChunk wrapping a new sources.Chunk that carries the
// decoded data together with the SourceName, SourceID, JobID, SecretID,
// SourceMetadata, SourceType and Verify values of the input chunk.
//
// Decoding runs in a fixed order: named entities, then decimal references,
// then hexadecimal references. Each pass operates on the output of the
// previous one.
func (d *HtmlEntity) FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk {
	if chunk == nil || len(chunk.Data) == 0 {
		return nil
	}
	// Cheap pre-filter: skip the regex work when no entity keyword is present.
	if htmlTrie.MatchFirst(chunk.Data) == nil {
		return nil
	}

	log := ctx.Logger().WithName("decoders.html")
	// Work on a private copy; necessary to avoid data races.
	data := bytes.Clone(chunk.Data)

	passes := [...]struct {
		pattern *regexp.Regexp
		decode  func(logr.Logger, []byte) []byte
	}{
		{namedEntityPat, decodeNamedEntities},
		{decimalEntityPat, decodeHtmlDecimal},
		{hexEntityPat, decodeHtmlHex},
	}

	found := false
	for _, pass := range passes {
		if !pass.pattern.Match(data) {
			continue
		}
		found = true
		data = pass.decode(log, data)
	}
	if !found {
		return nil
	}

	out := &sources.Chunk{
		Data:           data,
		SourceName:     chunk.SourceName,
		SourceID:       chunk.SourceID,
		JobID:          chunk.JobID,
		SecretID:       chunk.SecretID,
		SourceMetadata: chunk.SourceMetadata,
		SourceType:     chunk.SourceType,
		Verify:         chunk.Verify,
	}
	return &DecodableChunk{DecoderType: d.Type(), Chunk: out}
}

// decimalEntityPat matches decimal character references, e.g. `&#65;` = `A`.
// The capture group holds the one-to-three digit value.
var decimalEntityPat = regexp.MustCompile(`&#(\d{1,3});`)

// decodeHtmlDecimal returns a new slice in which every decimal character
// reference in input whose value fits in one byte (0-255) is replaced by that
// byte. input itself is not modified.
//
// References that cannot be decoded (the pattern admits values up to 999) are
// reported through logger and left in the output verbatim.
func decodeHtmlDecimal(logger logr.Logger, input []byte) []byte {
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range decimalEntityPat.FindAllSubmatchIndex(input, -1) {
		startIndex, endIndex := match[0], match[1]
		decStartIndex, decEndIndex := match[2], match[3]

		num, err := strconv.Atoi(string(input[decStartIndex:decEndIndex]))
		if err != nil {
			// Nothing has been copied yet and lastIndex is unchanged, so the
			// undecodable entity is emitted verbatim with the next copy.
			continue
		}
		if num < 0 || num > 255 {
			logger.Error(errors.New("invalid decimal byte"), "Unable to decode HTML entity", "match", input[decStartIndex:decEndIndex], "byte", num)
			continue
		}

		// Only now that the entity is known to be decodable, copy the text
		// preceding it. Copying before validation would duplicate that text,
		// because lastIndex is not advanced when an entity is skipped.
		decoded = append(decoded, input[lastIndex:startIndex]...)
		decoded = append(decoded, byte(num))
		lastIndex = endIndex
	}

	// Append the remaining part of the input.
	decoded = append(decoded, input[lastIndex:]...)

	return decoded
}

// hexEntityPat matches hexadecimal character references, e.g. `&#x41;` = `A`.
// Matching is case-insensitive and the capture group holds the one-or-two
// digit hexadecimal value.
var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,2});`)

// decodeHtmlHex returns a new slice in which every hexadecimal character
// reference in input is replaced by the single byte it encodes. input itself
// is not modified.
//
// A reference that fails to parse or falls outside 0-255 is left in the
// output verbatim; the out-of-range case is also reported through logger.
func decodeHtmlHex(logger logr.Logger, input []byte) []byte {
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range hexEntityPat.FindAllSubmatchIndex(input, -1) {
		startIndex, endIndex := match[0], match[1]
		hexStartIndex, hexEndIndex := match[2], match[3]

		// Parse the hexadecimal value to an integer.
		num, err := strconv.ParseInt(string(input[hexStartIndex:hexEndIndex]), 16, 32)
		if err != nil {
			// Nothing has been copied yet and lastIndex is unchanged, so the
			// entity is emitted verbatim with the next copy.
			continue
		}
		if num < 0 || num > 255 {
			logger.Error(errors.New("invalid hex byte"), "Unable to decode HTML entity", "match", input[hexStartIndex:hexEndIndex], "byte", num)
			continue
		}

		// Copy the preceding text only after validation so that a skipped
		// entity can never cause that text to be emitted twice.
		decoded = append(decoded, input[lastIndex:startIndex]...)
		decoded = append(decoded, byte(num))
		lastIndex = endIndex
	}

	// Append the remaining part of the input.
	decoded = append(decoded, input[lastIndex:]...)

	return decoded
}

var (
	// namedEntityMap maps lowercased named character references to the bytes
	// they decode to. Keys include the leading `&` and trailing `;`.
	// https://www.compart.com/en/unicode/html
	namedEntityMap = map[string][]byte{
		// Written as an escape so the tab cannot be lost to whitespace
		// normalisation by editors or formatters.
		"&tab;":              []byte("\t"),
		"&newline;":          []byte("\n"),
		"&excl;":             []byte("!"),
		"&quot;":             []byte(`"`),
		"&num;":              []byte("#"),
		"&dollar;":           []byte("$"),
		"&percnt;":           []byte("%"),
		"&amp;":              []byte("&"),
		"&apos;":             []byte("'"),
		"&lpar;":             []byte("("),
		"&rpar;":             []byte(")"),
		"&ast;":              []byte("*"),
		"&plus;":             []byte("+"),
		"&comma;":            []byte(","),
		"&period;":           []byte("."),
		"&sol;":              []byte("/"),
		"&colon;":            []byte(":"),
		"&semi;":             []byte(";"),
		"&lt;":               []byte("<"),
		"&equals;":           []byte("="),
		"&gt;":               []byte(">"),
		"&quest;":            []byte("?"),
		"&commat;":           []byte("@"),
		"&lsqb;":             []byte("["),
		"&bsol;":             []byte("\\"),
		"&rsqb;":             []byte("]"),
		"&hat;":              []byte("^"),
		"&underbar;":         []byte("_"),
		"&diacriticalgrave;": []byte("`"),
		"&lcub;":             []byte("{"),
		"&verticalline;":     []byte("|"),
		"&rcub;":             []byte("}"),
		"&nonbreakingspace;": []byte(" "),
	}

	// namedEntityPat is a case-insensitive alternation of every key in
	// namedEntityMap. Keys are quoted so that an entry containing a regex
	// metacharacter can never change the meaning of the pattern.
	namedEntityPat = func() *regexp.Regexp {
		alternatives := make([]string, 0, len(namedEntityMap))
		for entity := range namedEntityMap {
			alternatives = append(alternatives, regexp.QuoteMeta(entity))
		}
		return regexp.MustCompile("(?i)(" + strings.Join(alternatives, "|") + ")")
	}()
)

// decodeNamedEntities returns a copy of input in which every match of
// namedEntityPat is replaced by its entry in namedEntityMap. Matches are
// lowercased before the lookup; a match with no entry is kept unchanged.
// The logger argument is unused.
func decodeNamedEntities(_ logr.Logger, input []byte) []byte {
	resolve := func(entity []byte) []byte {
		key := strings.ToLower(string(entity))
		decoded, known := namedEntityMap[key]
		if !known {
			return entity
		}
		return decoded
	}
	return namedEntityPat.ReplaceAllFunc(input, resolve)
}
Loading

0 comments on commit ac02868

Please sign in to comment.