-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Loading status checks…
feat(decoders): HTML entities
Showing
5 changed files
with
1,316 additions
and
1,033 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder { | |
&Base64{}, | ||
&UTF16{}, | ||
&EscapedUnicode{}, | ||
&HtmlEntity{}, | ||
} | ||
} | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
package decoders | ||
|
||
import ( | ||
"regexp" | ||
"strconv" | ||
"strings" | ||
|
||
"golang.org/x/exp/maps" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" | ||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources" | ||
) | ||
|
||
// HtmlEntity decodes characters that are encoded as decimal, hexadecimal, or named entities. | ||
// https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html | ||
type HtmlEntity struct{} | ||
|
||
var _ Decoder = (*HtmlEntity)(nil) | ||
|
||
func (d *HtmlEntity) Type() detectorspb.DecoderType { | ||
return detectorspb.DecoderType_HTML | ||
} | ||
|
||
func (d *HtmlEntity) FromChunk(chunk *sources.Chunk) *DecodableChunk { | ||
if chunk == nil || len(chunk.Data) == 0 { | ||
return nil | ||
} | ||
|
||
matched := false | ||
if namedEntityPat.Match(chunk.Data) { | ||
matched = true | ||
chunk.Data = decodeNamedEntities(chunk.Data) | ||
} | ||
if decimalEntityPat.Match(chunk.Data) { | ||
matched = true | ||
chunk.Data = decodeHtmlDecimal(chunk.Data) | ||
} | ||
if hexEntityPat.Match(chunk.Data) { | ||
matched = true | ||
chunk.Data = decodeHtmlHex(chunk.Data) | ||
} | ||
|
||
if matched { | ||
decodableChunk := &DecodableChunk{ | ||
DecoderType: detectorspb.DecoderType_ESCAPED_UNICODE, | ||
Chunk: chunk, | ||
} | ||
return decodableChunk | ||
} else { | ||
return nil | ||
} | ||
} | ||
|
||
// decimalEntityPat matches a decimal character reference of one to three
// digits and captures the digits, e.g. `&#65;` = `A`.
var decimalEntityPat = regexp.MustCompile(`&#(\d{1,3});`)

// decodeHtmlDecimal returns a copy of input in which every decimal character
// reference matched by decimalEntityPat is replaced by the UTF-8 encoding of
// the code point it names. All other bytes are copied through unchanged.
//
// The number is treated as a Unicode code point rather than a raw byte, so
// values above 127 produce valid UTF-8 (`&#233;` becomes "é") and values above
// 255 are no longer truncated modulo 256 (`&#321;` used to decode to "A").
func decodeHtmlDecimal(input []byte) []byte {
	// A reference is at least four bytes and decodes to at most two, so the
	// output never exceeds the input length.
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range decimalEntityPat.FindAllSubmatchIndex(input, -1) {
		startIndex, endIndex := match[0], match[1]

		// Parse before copying anything: if the digits cannot be parsed the
		// reference is left as-is and is copied verbatim with the next span,
		// instead of the preceding text being appended twice.
		num, err := strconv.Atoi(string(input[match[2]:match[3]]))
		if err != nil {
			continue
		}

		// Copy the text before the reference, then the decoded character.
		decoded = append(decoded, input[lastIndex:startIndex]...)
		decoded = append(decoded, string(rune(num))...)

		lastIndex = endIndex
	}

	// Append whatever follows the last decoded reference.
	return append(decoded, input[lastIndex:]...)
}
|
||
// hexEntityPat matches a hexadecimal character reference of one or two hex
// digits (case-insensitive) and captures the digits, e.g. `&#x41;` = `A`.
var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,2});`)

// decodeHtmlHex returns a copy of input in which every hexadecimal character
// reference matched by hexEntityPat is replaced by the UTF-8 encoding of the
// code point it names. All other bytes are copied through unchanged.
//
// The value is treated as a Unicode code point rather than a raw byte, so
// references in the range 0x80-0xFF produce valid UTF-8 (`&#xE9;` becomes "é")
// instead of a lone, invalid byte.
func decodeHtmlHex(input []byte) []byte {
	// A reference is at least five bytes and decodes to at most two, so the
	// output never exceeds the input length.
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range hexEntityPat.FindAllSubmatchIndex(input, -1) {
		startIndex, endIndex := match[0], match[1]

		// Parse before copying anything: if the digits cannot be parsed the
		// reference is left as-is and is copied verbatim with the next span,
		// instead of the preceding text being appended twice.
		num, err := strconv.ParseInt(string(input[match[2]:match[3]]), 16, 32)
		if err != nil {
			continue
		}

		// Copy the text before the reference, then the decoded character.
		decoded = append(decoded, input[lastIndex:startIndex]...)
		decoded = append(decoded, string(rune(num))...)

		lastIndex = endIndex
	}

	// Append whatever follows the last decoded reference.
	return append(decoded, input[lastIndex:]...)
}
|
||
var ( | ||
// https://www.compart.com/en/unicode/html | ||
namedEntityMap = map[string][]byte{ | ||
"&tab;": []byte(" "), | ||
"&newline;": []byte("\n"), | ||
"!": []byte("!"), | ||
""": []byte(`"`), | ||
"#": []byte("#"), | ||
"$": []byte("$"), | ||
"%": []byte("%"), | ||
"&": []byte("&"), | ||
"'": []byte("'"), | ||
"(": []byte("("), | ||
")": []byte(")"), | ||
"*": []byte("*"), | ||
"+": []byte("+"), | ||
",": []byte(","), | ||
".": []byte("."), | ||
"/": []byte("/"), | ||
":": []byte(":"), | ||
";": []byte(";"), | ||
"<": []byte("<"), | ||
"=": []byte("="), | ||
">": []byte(">"), | ||
"?": []byte("?"), | ||
"@": []byte("@"), | ||
"[": []byte("["), | ||
"\": []byte("\\"), | ||
"]": []byte("]"), | ||
"&hat;": []byte("^"), | ||
"&underbar;": []byte("_"), | ||
"&diacriticalgrave;": []byte("`"), | ||
"{": []byte("{"), | ||
"&verticalline;": []byte("|"), | ||
"}": []byte("}"), | ||
"&nonbreakingspace;": []byte(" "), | ||
} | ||
namedEntityPat = func() *regexp.Regexp { | ||
return regexp.MustCompile( | ||
"(?i)(" + strings.Join(maps.Keys(namedEntityMap), "|") + ")") | ||
}() | ||
) | ||
|
||
func decodeNamedEntities(input []byte) []byte { | ||
return namedEntityPat.ReplaceAllFunc(input, func(match []byte) []byte { | ||
m := strings.ToLower(string(match)) | ||
if replacement, ok := namedEntityMap[m]; ok { | ||
return replacement | ||
} | ||
return match | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
package decoders | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/kylelemons/godebug/pretty" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources" | ||
) | ||
|
||
func TestHtmlEntity_FromChunk(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
chunk *sources.Chunk | ||
want *sources.Chunk | ||
wantErr bool | ||
}{ | ||
//  | ||
{ | ||
name: "[decimal] all encoded", | ||
chunk: &sources.Chunk{ | ||
Data: []byte("token: "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""), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte("token: \"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\""), | ||
}, | ||
}, | ||
{ | ||
name: "[decimal] mixed content", | ||
chunk: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
}, | ||
//  | ||
{ | ||
name: "[hex] all encoded", | ||
chunk: &sources.Chunk{ | ||
Data: []byte("token: "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""), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
}, | ||
{ | ||
name: "[hex] mixed content", | ||
chunk: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
}, | ||
// " | ||
{ | ||
name: "[named] all encoded", | ||
chunk: &sources.Chunk{ | ||
Data: []byte("	
!"#$%&'()*+,./:;<=>?@[\]^_`{|} "), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "), | ||
}, | ||
}, | ||
{ | ||
name: "[named] mixed content", | ||
chunk: &sources.Chunk{ | ||
Data: []byte("\t
!"#$%&'()*+,./:;<=>?@[\\]^_`{|} "), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "), | ||
}, | ||
}, | ||
|
||
// nothing | ||
{ | ||
name: "no escaped", | ||
chunk: &sources.Chunk{ | ||
Data: []byte(`-//npm.fontawesome.com/:_authToken=12345678-2323-1111-1111-12345670B312 | ||
+//npm.fontawesome.com/:_authToken=REMOVED_TOKEN`), | ||
}, | ||
want: nil, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
d := &HtmlEntity{} | ||
got := d.FromChunk(tt.chunk) | ||
if tt.want != nil { | ||
if got == nil { | ||
t.Fatal("got nil, did not want nil") | ||
} | ||
if diff := pretty.Compare(string(tt.want.Data), string(got.Data)); diff != "" { | ||
t.Errorf("HtmlEntity.FromChunk() %s diff: (-want +got)\n%s", tt.name, diff) | ||
} | ||
} else { | ||
if got != nil { | ||
t.Error("Expected nil chunk") | ||
} | ||
} | ||
}) | ||
} | ||
} |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ enum DecoderType { | |
BASE64 = 2; | ||
UTF16 = 3; | ||
ESCAPED_UNICODE = 4; | ||
HTML = 5; | ||
} | ||
|
||
enum DetectorType { | ||
|