[fix] - Improve UTF8 decoder's handling of non-printable characters (#3588)

* Avoid removing non-printable characters when decoding
* use byte slice
* remove new line

---------

Co-authored-by: Miccah <[email protected]>
trufflesecurity · Nov 15, 2024 · c6abe85 · c6abe85
1 parent cca7e6b
commit c6abe85
Show file tree

Hide file tree

Showing 3 changed files with 350 additions and 35 deletions.
diff --git a/pkg/decoders/utf16.go b/pkg/decoders/utf16.go
@@ -37,12 +37,12 @@ func utf16ToUTF8(b []byte) ([]byte, error) {
 	var bufBE, bufLE bytes.Buffer
 	for i := 0; i < len(b)-1; i += 2 {
 		if r := rune(binary.BigEndian.Uint16(b[i:])); b[i] == 0 && utf8.ValidRune(r) {
-			if isValidByte(byte(r)) {
+			if isPrintableByte(byte(r)) {
 				bufBE.WriteRune(r)
 			}
 		}
 		if r := rune(binary.LittleEndian.Uint16(b[i:])); b[i+1] == 0 && utf8.ValidRune(r) {
-			if isValidByte(byte(r)) {
+			if isPrintableByte(byte(r)) {
 				bufLE.WriteRune(r)
 			}
 		}

diff --git a/pkg/decoders/utf8.go b/pkg/decoders/utf8.go
@@ -1,7 +1,6 @@
 package decoders
 
 import (
-	"bytes"
 	"unicode/utf8"
 
 	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
@@ -29,35 +28,58 @@ func (d *UTF8) FromChunk(chunk *sources.Chunk) *DecodableChunk {
 	return decodableChunk
 }
 
-// extractSubstrings performs similarly to the strings binutil,
-// extacting contigous portions of printable characters that we care
-// about from some bytes
-func extractSubstrings(b []byte) []byte {
+// utf8ReplacementBytes holds the UTF-8 encoding of the Unicode replacement character
+// (utf8.RuneError, U+FFFD). It is a package-level value so extractSubstrings can append it
+// for every control character or undecodable byte without re-encoding the rune each time.
+var utf8ReplacementBytes = []byte(string(utf8.RuneError))
 
-	field := make([]byte, len(b))
-	fieldLen := 0
-	buf := &bytes.Buffer{}
-	for i, c := range b {
-		if isValidByte(c) {
-			field[fieldLen] = c
-			fieldLen++
-		} else {
-			if fieldLen > 5 {
-				buf.Write(field[:fieldLen])
+// extractSubstrings sanitizes b so that malformed input is handled consistently while
+// readable content is kept. The result is always valid UTF-8 and may be longer than b:
+//
+// For ASCII range (0-127): preserves printable characters (32-126) while replacing
+// control characters with the UTF-8 replacement character.
+// https://cs.opensource.google/go/go/+/refs/tags/go1.23.3:src/unicode/utf8/utf8.go;l=16
+//
+// For multi-byte sequences: preserves valid UTF-8 as-is (including an encoded U+FFFD),
+// while every byte that cannot be decoded is replaced with one replacement character.
+func extractSubstrings(b []byte) []byte {
+	dataLen := len(b)
+	buf := make([]byte, 0, dataLen)
+	for idx := 0; idx < dataLen; {
+		// If it's ASCII, handle separately.
+		// This is faster than decoding for common cases.
+		if b[idx] < utf8.RuneSelf {
+			if isPrintableByte(b[idx]) {
+				buf = append(buf, b[idx])
+			} else {
+				buf = append(buf, utf8ReplacementBytes...)
 			}
-			fieldLen = 0
+			idx++
+			continue
 		}

-		if i == len(b)-1 && fieldLen > 5 {
-			buf.Write(field[:fieldLen])
+		r, size := utf8.DecodeRune(b[idx:])
+		if r == utf8.RuneError && size == 1 {
+			// size == 1 marks a decoding failure; a validly encoded U+FFFD has size 3.
+			// Replace only this byte and resynchronize on the next one.
+			buf = append(buf, utf8ReplacementBytes...)
+			idx++
+		} else {
+			// Keep valid multi-byte UTF-8 sequences intact to preserve unicode characters.
+			buf = append(buf, b[idx:idx+size]...)
+			idx += size
 		}
 	}

-	return buf.Bytes()
+	return buf
 }
 
-func isValidByte(c byte) bool {
-	// https://www.rapidtables.com/code/text/ascii-table.html
-	// split on anything that is not ascii space through tilde
-	return c > 31 && c < 127
-}
+// isPrintableByte reports whether c is a printable ASCII character, i.e. lies in the
+// range 32 (space) through 126 (tilde), using a plain byte-range comparison.
+// extractSubstrings uses it as a fast path that avoids utf8.DecodeRune: any byte < 128
+// is a complete ASCII character on its own and doesn't need UTF-8 decoding.
+// Accepted bytes are space, letters, digits, punctuation, and symbols; control characters are rejected.
+// The upper bound is 127 (not 128) because 127 is the DEL control character.
+//
+// https://www.rapidtables.com/code/text/ascii-table.html
+func isPrintableByte(c byte) bool { return c > 31 && c < 127 }