From 6a98dcc6552a97448033e8d22da908676a8cd81c Mon Sep 17 00:00:00 2001 From: Richard Gomez Date: Sun, 10 Mar 2024 11:53:28 -0400 Subject: [PATCH] feat(decoders): HTML entities --- pkg/decoders/decoders.go | 1 + pkg/decoders/html_entity.go | 168 +++++++++++++++++++++++++++++++ pkg/decoders/html_entity_test.go | 105 +++++++++++++++++++ 3 files changed, 274 insertions(+) create mode 100644 pkg/decoders/html_entity.go create mode 100644 pkg/decoders/html_entity_test.go diff --git a/pkg/decoders/decoders.go b/pkg/decoders/decoders.go index 87491a67eb77..454c2e5c639d 100644 --- a/pkg/decoders/decoders.go +++ b/pkg/decoders/decoders.go @@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder { &Base64{}, &UTF16{}, &EscapedUnicode{}, + &HtmlEntity{}, } } diff --git a/pkg/decoders/html_entity.go b/pkg/decoders/html_entity.go new file mode 100644 index 000000000000..1df01e8aec7e --- /dev/null +++ b/pkg/decoders/html_entity.go @@ -0,0 +1,168 @@ +package decoders + +import ( + "regexp" + "strconv" + "strings" + + "golang.org/x/exp/maps" + + "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" + "github.com/trufflesecurity/trufflehog/v3/pkg/sources" +) + +// HtmlEntity decodes characters that are encoded as decimal, hexadecimal, or named entities. 
+// https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html
+type HtmlEntity struct{}
+
+var _ Decoder = (*HtmlEntity)(nil)
+
+// FromChunk decodes the named, decimal and hexadecimal HTML entities found in
+// chunk.Data, in that order, rewriting chunk.Data in place. It returns nil
+// when chunk is nil or empty, or when none of the entity patterns match;
+// otherwise it returns the same chunk wrapped in a DecodableChunk.
+func (d *HtmlEntity) FromChunk(chunk *sources.Chunk) *DecodableChunk {
+	if chunk == nil || len(chunk.Data) == 0 {
+		return nil
+	}
+
+	matched := false
+	if namedEntityPat.Match(chunk.Data) {
+		matched = true
+		chunk.Data = decodeNamedEntities(chunk.Data)
+	}
+	if decimalEntityPat.Match(chunk.Data) {
+		matched = true
+		chunk.Data = decodeHtmlDecimal(chunk.Data)
+	}
+	if hexEntityPat.Match(chunk.Data) {
+		matched = true
+		chunk.Data = decodeHtmlHex(chunk.Data)
+	}
+	if !matched {
+		return nil
+	}
+
+	return &DecodableChunk{
+		// NOTE(review): this reuses the ESCAPED_UNICODE decoder type; confirm
+		// whether detectorspb should gain a dedicated HTML-entity value.
+		DecoderType: detectorspb.DecoderType_ESCAPED_UNICODE,
+		Chunk:       chunk,
+	}
+}
+
+// `&#65;` = `A`
+var decimalEntityPat = regexp.MustCompile(`&#(\d{1,3});`)
+
+// decodeHtmlDecimal replaces decimal entities (`&#65;`) whose value fits in a
+// single byte with that byte. Values 128-255 are emitted as one raw byte, not
+// as UTF-8. Entities above 255 are left encoded.
+func decodeHtmlDecimal(input []byte) []byte {
+	return decodeNumericEntities(input, decimalEntityPat, 10)
+}
+
+// `&#x41;` = `A`
+var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,2});`)
+
+// decodeHtmlHex replaces one- or two-digit hexadecimal entities (`&#x41;`)
+// with the byte they encode.
+func decodeHtmlHex(input []byte) []byte {
+	return decodeNumericEntities(input, hexEntityPat, 16)
+}
+
+// decodeNumericEntities replaces every match of pat in input with the single
+// byte whose value is written, in the given base, in the pattern's first
+// capture group. Matches whose value does not fit in one byte (e.g. `&#999;`)
+// are copied through unchanged.
+func decodeNumericEntities(input []byte, pat *regexp.Regexp, base int) []byte {
+	decoded := make([]byte, 0, len(input))
+	lastIndex := 0
+
+	for _, match := range pat.FindAllSubmatchIndex(input, -1) {
+		startIndex, endIndex := match[0], match[1]
+
+		// Copy the literal text between the previous entity and this one.
+		decoded = append(decoded, input[lastIndex:startIndex]...)
+		// Advance unconditionally so a skipped entity is never copied twice.
+		lastIndex = endIndex
+
+		// bitSize 8 rejects values above 255 instead of letting the byte
+		// conversion below wrap them around to an unrelated character.
+		num, err := strconv.ParseUint(string(input[match[2]:match[3]]), base, 8)
+		if err != nil {
+			decoded = append(decoded, input[startIndex:endIndex]...)
+			continue
+		}
+		decoded = append(decoded, byte(num))
+	}
+
+	// Append whatever follows the last entity.
+	return append(decoded, input[lastIndex:]...)
+}
+
+var (
+	// namedEntityMap maps lower-cased entity names to their replacement bytes.
+	// https://www.compart.com/en/unicode/html
+	//
+	// NOTE(review): the keys for the punctuation entries were reconstructed
+	// from their replacement characters using the names listed at the URL
+	// above; confirm the spellings. `&nonbreakingspace;` decodes to a plain
+	// space here rather than U+00A0; confirm that is intended.
+	namedEntityMap = map[string][]byte{
+		"&tab;":              []byte("\t"),
+		"&newline;":          []byte("\n"),
+		"&excl;":             []byte("!"),
+		"&quot;":             []byte(`"`),
+		"&num;":              []byte("#"),
+		"&dollar;":           []byte("$"),
+		"&percnt;":           []byte("%"),
+		"&amp;":              []byte("&"),
+		"&apos;":             []byte("'"),
+		"&lpar;":             []byte("("),
+		"&rpar;":             []byte(")"),
+		"&ast;":              []byte("*"),
+		"&plus;":             []byte("+"),
+		"&comma;":            []byte(","),
+		"&period;":           []byte("."),
+		"&sol;":              []byte("/"),
+		"&colon;":            []byte(":"),
+		"&semi;":             []byte(";"),
+		"&lt;":               []byte("<"),
+		"&equals;":           []byte("="),
+		"&gt;":               []byte(">"),
+		"&quest;":            []byte("?"),
+		"&commat;":           []byte("@"),
+		"&lsqb;":             []byte("["),
+		"&bsol;":             []byte("\\"),
+		"&rsqb;":             []byte("]"),
+		"&hat;":              []byte("^"),
+		"&underbar;":         []byte("_"),
+		"&diacriticalgrave;": []byte("`"),
+		"&lcub;":             []byte("{"),
+		"&verticalline;":     []byte("|"),
+		"&rcub;":             []byte("}"),
+		"&nonbreakingspace;": []byte(" "),
+	}
+	// namedEntityPat matches any key of namedEntityMap, ignoring case.
+	namedEntityPat = func() *regexp.Regexp {
+		names := maps.Keys(namedEntityMap)
+		// Quote each name so that a key containing regular-expression
+		// metacharacters can never change or break the alternation.
+		for i, name := range names {
+			names[i] = regexp.QuoteMeta(name)
+		}
+		return regexp.MustCompile("(?i)(" + strings.Join(names, "|") + ")")
+	}()
+)
+
+// decodeNamedEntities replaces every named entity listed in namedEntityMap
+// with its character. Matching is case-insensitive; a match whose lower-cased
+// form is not a map key is returned unchanged.
+func decodeNamedEntities(input []byte) []byte {
+	return namedEntityPat.ReplaceAllFunc(input, func(match []byte) []byte {
+		if replacement, ok := namedEntityMap[strings.ToLower(string(match))]; ok {
+			return replacement
+		}
+		return match
+	})
+}
diff --git a/pkg/decoders/html_entity_test.go b/pkg/decoders/html_entity_test.go
new file mode 100644
index 000000000000..9a1b318888c1
--- /dev/null
+++ b/pkg/decoders/html_entity_test.go
@@ -0,0 +1,128 @@
+package decoders
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+
+	"github.com/kylelemons/godebug/pretty"
+
+	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
+)
+
+// plainToken is the text every numeric-entity case must decode to.
+const plainToken = `token: "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"`
+
+// encodeAll rewrites every byte of s as a numeric entity using format,
+// e.g. "&#%d;" for decimal or "&#x%x;" for hexadecimal.
+func encodeAll(s, format string) string {
+	var b strings.Builder
+	for i := 0; i < len(s); i++ {
+		fmt.Fprintf(&b, format, s[i])
+	}
+	return b.String()
+}
+
+func TestHtmlEntity_FromChunk(t *testing.T) {
+	tests := []struct {
+		name  string
+		chunk *sources.Chunk
+		want  *sources.Chunk
+	}{
+		// &#34;
+		{
+			name: "[decimal] all encoded",
+			chunk: &sources.Chunk{
+				Data: []byte(encodeAll(plainToken, "&#%d;")),
+			},
+			want: &sources.Chunk{
+				Data: []byte(plainToken),
+			},
+		},
+		{
+			name: "[decimal] mixed content",
+			chunk: &sources.Chunk{
+				Data: []byte(`token: &#34;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&#34;`),
+			},
+			want: &sources.Chunk{
+				Data: []byte(plainToken),
+			},
+		},
+		{
+			name: "[decimal] value above 255 is left encoded",
+			chunk: &sources.Chunk{
+				Data: []byte(`key=&#321;&#66;`),
+			},
+			want: &sources.Chunk{
+				Data: []byte(`key=&#321;B`),
+			},
+		},
+		// &#x22;
+		{
+			name: "[hex] all encoded",
+			chunk: &sources.Chunk{
+				Data: []byte(encodeAll(plainToken, "&#x%x;")),
+			},
+			want: &sources.Chunk{
+				Data: []byte(plainToken),
+			},
+		},
+		{
+			name: "[hex] mixed content",
+			chunk: &sources.Chunk{
+				Data: []byte(`token: &#x22;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&#x22;`),
+			},
+			want: &sources.Chunk{
+				Data: []byte(plainToken),
+			},
+		},
+		// &quot;
+		{
+			name: "[named] all encoded",
+			chunk: &sources.Chunk{
+				Data: []byte("&Tab;&NewLine;&excl;&quot;&num;&dollar;&percnt;&amp;&apos;&lpar;&rpar;&ast;&plus;&comma;&period;&sol;&colon;&semi;&lt;&equals;&gt;&quest;&commat;&lsqb;&bsol;&rsqb;&Hat;&UnderBar;&DiacriticalGrave;&lcub;&VerticalLine;&rcub;&NonBreakingSpace;"),
+			},
+			want: &sources.Chunk{
+				Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "),
+			},
+		},
+		{
+			name: "[named] mixed content",
+			chunk: &sources.Chunk{
+				Data: []byte("\t&NewLine;&excl;&quot;&num;&dollar;&percnt;&amp;&apos;&lpar;&rpar;&ast;&plus;&comma;&period;&sol;&colon;&semi;&lt;&equals;&gt;&quest;&commat;&lsqb;\\&rsqb;&Hat;&UnderBar;&DiacriticalGrave;&lcub;&VerticalLine;&rcub;&NonBreakingSpace;"),
+			},
+			want: &sources.Chunk{
+				Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "),
+			},
+		},
+
+		// nothing
+		{
+			name: "no escaped",
+			chunk: &sources.Chunk{
+				Data: []byte(`-//npm.fontawesome.com/:_authToken=12345678-2323-1111-1111-12345670B312
++//npm.fontawesome.com/:_authToken=REMOVED_TOKEN`),
+			},
+			want: nil,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			d := &HtmlEntity{}
+			got := d.FromChunk(tt.chunk)
+			if tt.want == nil {
+				if got != nil {
+					t.Error("Expected nil chunk")
+				}
+				return
+			}
+			if got == nil {
+				t.Fatal("got nil, did not want nil")
+			}
+			if diff := pretty.Compare(string(tt.want.Data), string(got.Data)); diff != "" {
+				t.Errorf("HtmlEntity.FromChunk() %s diff: (-want +got)\n%s", tt.name, diff)
+			}
+		})
+	}
+}