Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create decoder for HTML entities #44

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hack/snifftest/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func main() {
for chunk := range chunksChan {
for name, scanner := range selectedScanners {
for _, dec := range allDecoders {
decoded := dec.FromChunk(&sources.Chunk{Data: chunk.Data})
decoded := dec.FromChunk(ctx, &sources.Chunk{Data: chunk.Data})
if decoded != nil {
foundKeyword := false
for _, kw := range scanner.Keywords() {
Expand Down
3 changes: 2 additions & 1 deletion pkg/decoders/base64.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/base64"
"unicode"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand All @@ -31,7 +32,7 @@ func (d *Base64) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_BASE64
}

func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
func (d *Base64) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
decodableChunk := &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
encodedSubstrings := getSubstringsOfCharacterSet(chunk.Data, 20, b64CharsetMapping, b64EndChars)
decodedSubstrings := make(map[string][]byte)
Expand Down
9 changes: 5 additions & 4 deletions pkg/decoders/base64_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (

"github.com/kylelemons/godebug/pretty"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand Down Expand Up @@ -134,7 +135,7 @@ func TestBase64_FromChunk(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d := &Base64{}
got := d.FromChunk(tt.chunk)
got := d.FromChunk(context.Background(), tt.chunk)
if tt.want != nil {
if got == nil {
t.Fatal("got nil, did not want nil")
Expand All @@ -156,7 +157,7 @@ func BenchmarkFromChunkSmall(b *testing.B) {
data := detectors.MustGetBenchmarkData()["small"]

for n := 0; n < b.N; n++ {
d.FromChunk(&sources.Chunk{Data: data})
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
}
}

Expand All @@ -165,7 +166,7 @@ func BenchmarkFromChunkMedium(b *testing.B) {
data := detectors.MustGetBenchmarkData()["medium"]

for n := 0; n < b.N; n++ {
d.FromChunk(&sources.Chunk{Data: data})
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
}
}

Expand All @@ -174,6 +175,6 @@ func BenchmarkFromChunkLarge(b *testing.B) {
data := detectors.MustGetBenchmarkData()["big"]

for n := 0; n < b.N; n++ {
d.FromChunk(&sources.Chunk{Data: data})
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
}
}
7 changes: 5 additions & 2 deletions pkg/decoders/decoders.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package decoders

import (
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand All @@ -12,6 +13,7 @@ func DefaultDecoders() []Decoder {
&Base64{},
&UTF16{},
&EscapedUnicode{},
&HtmlEntity{},
}
}

Expand All @@ -23,21 +25,22 @@ type DecodableChunk struct {
}

type Decoder interface {
FromChunk(chunk *sources.Chunk) *DecodableChunk
FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk
Type() detectorspb.DecoderType
}

// Fuzz is an entrypoint for go-fuzz, which is an AFL-style fuzzing tool.
// This one attempts to uncover any panics during decoding.
func Fuzz(data []byte) int {
decoded := false
ctx := context.Background()
for i, decoder := range DefaultDecoders() {
// Skip the first decoder (plain), because it will always decode and give
// priority to the input (return 1).
if i == 0 {
continue
}
chunk := decoder.FromChunk(&sources.Chunk{Data: data})
chunk := decoder.FromChunk(ctx, &sources.Chunk{Data: data})
if chunk != nil {
decoded = true
}
Expand Down
5 changes: 3 additions & 2 deletions pkg/decoders/escaped_unicode.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strconv"
"unicode/utf8"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand All @@ -18,7 +19,7 @@ var _ Decoder = (*EscapedUnicode)(nil)
// https://dencode.com/en/string/unicode-escape
var (
// Standard Unicode notation.
//https://unicode.org/standard/principles.html
// https://unicode.org/standard/principles.html
codePointPat = regexp.MustCompile(`\bU\+([a-fA-F0-9]{4}).?`)

// Common escape sequence used in programming languages.
Expand All @@ -29,7 +30,7 @@ func (d *EscapedUnicode) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_ESCAPED_UNICODE
}

func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
func (d *EscapedUnicode) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
if chunk == nil || len(chunk.Data) == 0 {
return nil
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/decoders/escaped_unicode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (

"github.com/kylelemons/godebug/pretty"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

Expand Down Expand Up @@ -68,7 +69,7 @@ func TestUnicodeEscape_FromChunk(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d := &EscapedUnicode{}
got := d.FromChunk(tt.chunk)
got := d.FromChunk(context.Background(), tt.chunk)
if tt.want != nil {
if got == nil {
t.Fatal("got nil, did not want nil")
Expand Down
219 changes: 219 additions & 0 deletions pkg/decoders/html_entity.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
package decoders

import (
"bytes"
"errors"
"regexp"
"strconv"
"strings"
"sync"

ahocorasick "github.com/BobuSumisu/aho-corasick"
"github.com/go-logr/logr"
"golang.org/x/exp/maps"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// HtmlEntity decodes characters that are encoded as decimal, hexadecimal, or named entities.
// The type carries no fields, so a zero value is ready to use.
// https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html
type HtmlEntity struct{}

var (
// Compile-time assertion that *HtmlEntity implements Decoder.
_ Decoder = (*HtmlEntity)(nil)

// once wraps the construction of htmlTrie performed in init.
once sync.Once
// htmlTrie is the Aho-Corasick trie FromChunk consults as a pre-filter
// before running any of the entity regular expressions.
htmlTrie *ahocorasick.Trie
)

// init populates htmlTrie with every marker that can introduce an HTML
// entity: the numeric prefixes plus each lower-cased key of namedEntityMap.
func init() {
	// Use Aho-Corasick to pre-filter potential matches.
	once.Do(func() {
		// A set, so that identical markers collapse into a single trie entry.
		markers := make(map[string]struct{}, len(namedEntityMap)+2)
		markers[`&#`] = struct{}{}  // decimal
		markers[`&#x`] = struct{}{} // hex
		for name := range namedEntityMap {
			markers[strings.ToLower(name)] = struct{}{}
		}

		builder := ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(markers))
		htmlTrie = builder.Build()
	})
}

// Type reports the decoder type of HtmlEntity, detectorspb.DecoderType_HTML.
func (d *HtmlEntity) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_HTML
}

// FromChunk decodes named, decimal and hexadecimal HTML entities found in
// chunk.Data, in that order, and returns the result wrapped in a new chunk
// that copies the source metadata of the original.
//
// It returns nil when chunk is nil or empty, when the trie pre-filter finds no
// entity marker, or when none of the entity patterns match.
func (d *HtmlEntity) FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk {
	if chunk == nil || len(chunk.Data) == 0 {
		return nil
	}
	// Cheap pre-filter: skip the regular expressions when no marker is present.
	if htmlTrie.MatchFirst(chunk.Data) == nil {
		return nil
	}

	logger := ctx.Logger().WithName("decoders.html")
	// Necessary to avoid data races.
	data := bytes.Clone(chunk.Data)
	found := false

	if namedEntityPat.Match(data) {
		found = true
		data = decodeNamedEntities(logger, data)
	}
	if decimalEntityPat.Match(data) {
		found = true
		data = decodeHtmlDecimal(logger, data)
	}
	if hexEntityPat.Match(data) {
		found = true
		data = decodeHtmlHex(logger, data)
	}

	if !found {
		return nil
	}

	decodedChunk := &sources.Chunk{
		Data:           data,
		SourceName:     chunk.SourceName,
		SourceID:       chunk.SourceID,
		JobID:          chunk.JobID,
		SecretID:       chunk.SecretID,
		SourceMetadata: chunk.SourceMetadata,
		SourceType:     chunk.SourceType,
		Verify:         chunk.Verify,
	}
	return &DecodableChunk{
		DecoderType: d.Type(),
		Chunk:       decodedChunk,
	}
}

// `A` = `&#65;`
var decimalEntityPat = regexp.MustCompile(`&#(\d{1,3});`)

func decodeHtmlDecimal(logger logr.Logger, input []byte) []byte {
decoded := make([]byte, 0, len(input))
lastIndex := 0

for _, match := range decimalEntityPat.FindAllSubmatchIndex(input, -1) {
startIndex := match[0]
endIndex := match[1]
decStartIndex := match[2]
decEndIndex := match[3]

// Copy the part of the input until the start of the entity
decoded = append(decoded, input[lastIndex:startIndex]...)

num, err := strconv.Atoi(string(input[decStartIndex:decEndIndex]))
if err != nil {
continue
}

// Append the decoded byte
if num < 0 || num > 255 {
logger.Error(errors.New("invalid decimal byte"), "Unable to decode HTML entity", "match", input[decStartIndex:decEndIndex], "byte", num)
continue
}
decoded = append(decoded, byte(num))

Check failure

Code scanning / CodeQL

Incorrect conversion between integer types

Incorrect conversion of an integer with architecture-dependent bit size from [strconv.Atoi](1) to a lower bit size type uint8 without an upper bound check. Incorrect conversion of an integer with architecture-dependent bit size from [strconv.Atoi](1) to a lower bit size type uint8 without an upper bound check.

Copilot Autofix AI about 1 month ago

To fix the problem, we need to ensure that the integer value parsed from the string is within the valid range for a byte (0-255) before performing the conversion. This can be done by adding a bounds check after parsing the integer and before converting it to a byte.

  1. Parse the integer using strconv.Atoi.
  2. Check if the parsed integer is within the range of 0 to 255.
  3. If the integer is within the valid range, convert it to a byte.
  4. If the integer is outside the valid range, handle the error appropriately (e.g., skip the conversion or use a default value).
Suggested changeset 1
pkg/decoders/html_entity.go

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/pkg/decoders/html_entity.go b/pkg/decoders/html_entity.go
--- a/pkg/decoders/html_entity.go
+++ b/pkg/decoders/html_entity.go
@@ -110,4 +110,7 @@
 
-		// Append the decoded byte
-		decoded = append(decoded, byte(num))
+		// Check if the parsed number is within the valid range for a byte
+		if num >= 0 && num <= 255 {
+			// Append the decoded byte
+			decoded = append(decoded, byte(num))
+		}
 
EOF
@@ -110,4 +110,7 @@

// Append the decoded byte
decoded = append(decoded, byte(num))
// Check if the parsed number is within the valid range for a byte
if num >= 0 && num <= 255 {
// Append the decoded byte
decoded = append(decoded, byte(num))
}

Copilot is powered by AI and may make mistakes. Always verify output.
Unable to commit as this autofix suggestion is now outdated
Positive Feedback
Negative Feedback

Provide additional feedback

Please help us improve GitHub Copilot by sharing more details about this comment.

Please select one or more of the options
lastIndex = endIndex
}

// Append the remaining part of the input
decoded = append(decoded, input[lastIndex:]...)

return decoded
}

// `A` = `&#x41;`
var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,2});`)

func decodeHtmlHex(logger logr.Logger, input []byte) []byte {
decoded := make([]byte, 0, len(input))
lastIndex := 0

for _, match := range hexEntityPat.FindAllSubmatchIndex(input, -1) {
startIndex := match[0]
endIndex := match[1]
hexStartIndex := match[2]
hexEndIndex := match[3]

// Copy the part of the input until the start of the entity
decoded = append(decoded, input[lastIndex:startIndex]...)

// Parse the hexadecimal value to an integer
num, err := strconv.ParseInt(string(input[hexStartIndex:hexEndIndex]), 16, 32)
if err != nil {
continue
}

// Append the decoded byte
if num < 0 || num > 255 {
logger.Error(errors.New("invalid hex byte"), "Unable to decode HTML entity", "match", input[hexStartIndex:hexEndIndex], "byte", num)
continue
}
decoded = append(decoded, byte(num))

Check failure

Code scanning / CodeQL

Incorrect conversion between integer types

Incorrect conversion of a signed 32-bit integer from [strconv.ParseInt](1) to a lower bit size type uint8 without an upper bound check.

Copilot Autofix AI about 1 month ago

To fix the problem, we need to ensure that the parsed integer value is within the valid range for a byte (0 to 255) before performing the conversion. This can be done by adding a bounds check after parsing the integer and before converting it to a byte.

  • We will add a check to ensure that the parsed integer is within the range of 0 to 255.
  • If the parsed integer is outside this range, we will skip the conversion and continue with the next match.
  • This change will be made in the decodeHtmlHex function in the file pkg/decoders/html_entity.go.
Suggested changeset 1
pkg/decoders/html_entity.go

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/pkg/decoders/html_entity.go b/pkg/decoders/html_entity.go
--- a/pkg/decoders/html_entity.go
+++ b/pkg/decoders/html_entity.go
@@ -144,2 +144,7 @@
 
+		// Check if the parsed number is within the valid range for a byte
+		if num < 0 || num > 255 {
+			continue
+		}
+
 		// Append the decoded byte
EOF
@@ -144,2 +144,7 @@

// Check if the parsed number is within the valid range for a byte
if num < 0 || num > 255 {
continue
}

// Append the decoded byte
Copilot is powered by AI and may make mistakes. Always verify output.
Unable to commit as this autofix suggestion is now outdated
Positive Feedback
Negative Feedback

Provide additional feedback

Please help us improve GitHub Copilot by sharing more details about this comment.

Please select one or more of the options

lastIndex = endIndex
}

// Append the remaining part of the input
decoded = append(decoded, input[lastIndex:]...)

return decoded
}

var (
	// namedEntityMap maps lower-cased named character references to the bytes
	// they stand for. Whitespace values are written as escapes so they cannot
	// be silently altered by editors or copy/paste.
	// https://www.compart.com/en/unicode/html
	namedEntityMap = map[string][]byte{
		"&tab;":              []byte("\t"),
		"&newline;":          []byte("\n"),
		"&excl;":             []byte("!"),
		"&quot;":             []byte(`"`),
		"&num;":              []byte("#"),
		"&dollar;":           []byte("$"),
		"&percnt;":           []byte("%"),
		"&amp;":              []byte("&"),
		"&apos;":             []byte("'"),
		"&lpar;":             []byte("("),
		"&rpar;":             []byte(")"),
		"&ast;":              []byte("*"),
		"&plus;":             []byte("+"),
		"&comma;":            []byte(","),
		"&period;":           []byte("."),
		"&sol;":              []byte("/"),
		"&colon;":            []byte(":"),
		"&semi;":             []byte(";"),
		"&lt;":               []byte("<"),
		"&equals;":           []byte("="),
		"&gt;":               []byte(">"),
		"&quest;":            []byte("?"),
		"&commat;":           []byte("@"),
		"&lsqb;":             []byte("["),
		"&bsol;":             []byte("\\"),
		"&rsqb;":             []byte("]"),
		"&hat;":              []byte("^"),
		"&underbar;":         []byte("_"),
		"&diacriticalgrave;": []byte("`"),
		"&lcub;":             []byte("{"),
		"&verticalline;":     []byte("|"),
		"&rcub;":             []byte("}"),
		"&nonbreakingspace;": []byte("\u00a0"),
	}
	// namedEntityPat matches any key of namedEntityMap, case-insensitively.
	// Every key ends in `;`, so no key is a prefix of another and the order of
	// the alternatives does not affect which one matches.
	namedEntityPat = func() *regexp.Regexp {
		alternatives := make([]string, 0, len(namedEntityMap))
		for entity := range namedEntityMap {
			// Quote defensively so a future key containing a regexp
			// metacharacter is still matched literally.
			alternatives = append(alternatives, regexp.QuoteMeta(entity))
		}
		return regexp.MustCompile("(?i)(" + strings.Join(alternatives, "|") + ")")
	}()
)

// decodeNamedEntities returns a copy of input in which every match of
// namedEntityPat is replaced by its value from namedEntityMap. The match is
// lower-cased before the lookup; a match with no map entry is kept as is.
// The logger argument is unused.
func decodeNamedEntities(_ logr.Logger, input []byte) []byte {
	substitute := func(entity []byte) []byte {
		decodedBytes, found := namedEntityMap[strings.ToLower(string(entity))]
		if !found {
			return entity
		}
		return decodedBytes
	}
	return namedEntityPat.ReplaceAllFunc(input, substitute)
}
Loading
Loading