open-telemetry · pellared · Dec 12, 2024 · Dec 11, 2024 · Dec 11, 2024
@@ -33,6 +33,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 - Fix inconsistent request body closing in `go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp`. (#5954)
 - Fix invalid exemplar keys in `go.opentelemetry.io/otel/exporters/prometheus`. (#5995)
 - Fix attribute value truncation in `go.opentelemetry.io/otel/sdk/trace`. (#5997)
+- Fix attribute value truncation in `go.opentelemetry.io/otel/sdk/log`. (#6032)
 
 <!-- Released section -->
 <!-- Don't change this section unless doing release -->

@@ -406,7 +406,7 @@ func (r *Record) applyValueLimits(val log.Value) log.Value {
 	case log.KindString:
 		s := val.AsString()
 		if len(s) > r.attributeValueLengthLimit {
-			val = log.StringValue(truncate(s, r.attributeValueLengthLimit))
+			val = log.StringValue(truncate(r.attributeValueLengthLimit, s))
 		}
 	case log.KindSlice:
 		sl := val.AsSlice()
@@ -427,40 +427,78 @@ func (r *Record) applyValueLimits(val log.Value) log.Value {
 	return val
 }
 
-// truncate returns a copy of str truncated to have a length of at most n
-// characters. If the length of str is less than n, str itself is returned.
+// truncate returns a truncated version of s such that it contains at most
+// the limit number of characters. Truncation is applied by returning the first
+// limit valid characters contained in s.
 //
-// The truncate of str ensures that no valid UTF-8 code point is split. The
-// copy returned will be less than n if a characters straddles the length
-// limit.
+// If limit is negative, it returns the original string.
 //
-// No truncation is performed if n is less than zero.
-func truncate(str string, n int) string {
-	if n < 0 {
-		return str
+// UTF-8 is supported. When truncating, all invalid characters are dropped
+// before applying truncation.
+//
+// If s already contains no more than the limit number of bytes, it is returned
+// unchanged. No invalid characters are removed.
+func truncate(limit int, s string) string {
+	// This prioritizes performance in the following order based on the most
+	// common expected use-cases.
+	//
+	//  - Short values less than the default limit (128).
+	//  - Strings with valid encodings that exceed the limit.
+	//  - No limit.
+	//  - Strings with invalid encodings that exceed the limit.
+	if limit < 0 || len(s) <= limit {
+		return s
 	}
 
-	// cut returns a copy of the s truncated to not exceed a length of n. If
-	// invalid UTF-8 is encountered, s is returned with false. Otherwise, the
-	// truncated copy will be returned with true.
-	cut := func(s string) (string, bool) {
-		var i int
-		for i = 0; i < n; {
-			r, size := utf8.DecodeRuneInString(s[i:])
-			if r == utf8.RuneError {
-				return s, false
+	// Optimistically, assume all valid UTF-8.
+	var b strings.Builder
+	count := 0
+	for i, c := range s {
+		if c != utf8.RuneError {
+			count++
+			if count > limit {
+				return s[:i]
 			}
-			if i+size > n {
-				break
-			}
-			i += size
+			continue
+		}
+
+		_, size := utf8.DecodeRuneInString(s[i:])
+		if size == 1 {
+			// Invalid encoding.
+			b.Grow(len(s) - 1)
+			_, _ = b.WriteString(s[:i])
+			s = s[i:]
+			break
 		}
-		return s[:i], true
 	}
 
-	cp, ok := cut(str)
-	if !ok {
-		cp, _ = cut(strings.ToValidUTF8(str, ""))
+	// Fast-path, no invalid input.
+	if b.Cap() == 0 {
+		return s
 	}
-	return cp
+
+	// Truncate while validating UTF-8.
+	for i := 0; i < len(s) && count < limit; {
+		c := s[i]
+		if c < utf8.RuneSelf {
+			// Optimization for single byte runes (common case).
+			_ = b.WriteByte(c)
+			i++
+			count++
+			continue
+		}
+
+		_, size := utf8.DecodeRuneInString(s[i:])
+		if size == 1 {
+			// We checked for all 1-byte runes above, this is a RuneError.
+			i++
+			continue
+		}
+
+		_, _ = b.WriteString(s[i : i+size])
+		i += size
+		count++
+	}
+
+	return b.String()
 }
@@ -570,74 +570,147 @@ func assertKV(t *testing.T, r Record, kv log.KeyValue) {
 }
 
 func TestTruncate(t *testing.T) {
-	testcases := []struct {
-		input, want string
-		limit       int
+	type group struct {
+		limit    int
+		input    string
+		expected string
+	}
+
+	tests := []struct {
+		name   string
+		groups []group
 	}{
+		// Edge case: limit is negative, no truncation should occur
 		{
-			input: "value",
-			want:  "value",
-			limit: -1,
-		},
-		{
-			input: "value",
-			want:  "",
-			limit: 0,
-		},
-		{
-			input: "value",
-			want:  "v",
-			limit: 1,
-		},
-		{
-			input: "value",
-			want:  "va",
-			limit: 2,
+			name: "NoTruncation",
+			groups: []group{
+				{-1, "No truncation!", "No truncation!"},
+			},
 		},
+
+		// Edge case: string is already shorter than the limit, no truncation
+		// should occur
 		{
-			input: "value",
-			want:  "val",
-			limit: 3,
+			name: "ShortText",
+			groups: []group{
+				{10, "Short text", "Short text"},
+				{15, "Short text", "Short text"},
+				{100, "Short text", "Short text"},
+			},
 		},
+
+		// Edge case: truncation happens with ASCII characters only
 		{
-			input: "value",
-			want:  "valu",
-			limit: 4,
+			name: "ASCIIOnly",
+			groups: []group{
+				{1, "Hello World!", "H"},
+				{5, "Hello World!", "Hello"},
+				{12, "Hello World!", "Hello World!"},
+			},
 		},
+
+		// Truncation including multi-byte characters (UTF-8)
 		{
-			input: "value",
-			want:  "value",
-			limit: 5,
+			name: "ValidUTF-8",
+			groups: []group{
+				{7, "Hello, 世界", "Hello, "},
+				{8, "Hello, 世界", "Hello, 世"},
+				{2, "こんにちは", "こん"},
+				{3, "こんにちは", "こんに"},
+				{5, "こんにちは", "こんにちは"},
+				{12, "こんにちは", "こんにちは"},
+			},
 		},
+
+		// Truncation with invalid UTF-8 characters
 		{
-			input: "value",
-			want:  "value",
-			limit: 6,
+			name: "InvalidUTF-8",
+			groups: []group{
+				{11, "Invalid\x80text", "Invalidtext"},
+				// Do not modify invalid text if equal to limit.
+				{11, "Valid text\x80", "Valid text\x80"},
+				// Do not modify invalid text if under limit.
+				{15, "Valid text\x80", "Valid text\x80"},
+				{5, "Hello\x80World", "Hello"},
+				{11, "Hello\x80World\x80!", "HelloWorld!"},
+				{15, "Hello\x80World\x80Test", "HelloWorldTest"},
+				{15, "Hello\x80\x80\x80World\x80Test", "HelloWorldTest"},
+				{15, "\x80\x80\x80Hello\x80\x80\x80World\x80Test\x80\x80", "HelloWorldTest"},
+			},
 		},
+
+		// Truncation with mixed valid and invalid UTF-8 characters
 		{
-			input: "€€€€", // 3 bytes each
-			want:  "€€€",
-			limit: 10,
+			name: "MixedUTF-8",
+			groups: []group{
+				{6, "€"[0:2] + "hello€€", "hello€"},
+				{6, "€" + "€"[0:2] + "hello", "€hello"},
+				{11, "Valid text\x80📜", "Valid text📜"},
+				{11, "Valid text📜\x80", "Valid text📜"},
+				{14, "😊 Hello\x80World🌍🚀", "😊 HelloWorld🌍🚀"},
+				{14, "😊\x80 Hello\x80World🌍🚀", "😊 HelloWorld🌍🚀"},
+				{14, "😊\x80 Hello\x80World🌍\x80🚀", "😊 HelloWorld🌍🚀"},
+				{14, "😊\x80 Hello\x80World🌍\x80🚀\x80", "😊 HelloWorld🌍🚀"},
+				{14, "\x80😊\x80 Hello\x80World🌍\x80🚀\x80", "😊 HelloWorld🌍🚀"},
+			},
 		},
+
+		// Edge case: empty string, should return empty string
 		{
-			input: "€"[0:2] + "hello€€", // corrupted first rune, then over limit
-			want:  "hello€",
-			limit: 10,
+			name: "Empty",
+			groups: []group{
+				{5, "", ""},
+			},
 		},
+
+		// Edge case: limit is 0, should return an empty string
 		{
-			input: "€"[0:2] + "hello", // corrupted first rune, then not over limit
-			want:  "hello",
-			limit: 10,
+			name: "Zero",
+			groups: []group{
+				{0, "Some text", ""},
+				{0, "", ""},
+			},
 		},
 	}
 
-	for _, tc := range testcases {
-		name := fmt.Sprintf("%s/%d", tc.input, tc.limit)
-		t.Run(name, func(t *testing.T) {
-			t.Log(tc.input, len(tc.input), tc.limit)
-			assert.Equal(t, tc.want, truncate(tc.input, tc.limit))
-		})
+	for _, tt := range tests {
+		for _, g := range tt.groups {
+			t.Run(tt.name, func(t *testing.T) {
+				t.Parallel()
+
+				got := truncate(g.limit, g.input)
+				assert.Equalf(
+					t, g.expected, got,
+					"input: %q([]rune%v))\ngot: %q([]rune%v)\nwant %q([]rune%v)",
+					g.input, []rune(g.input),
+					got, []rune(got),
+					g.expected, []rune(g.expected),
+				)
+			})
+		}
+	}
+}
+
+func BenchmarkTruncate(b *testing.B) {
+	run := func(limit int, input string) func(b *testing.B) {
+		return func(b *testing.B) {
+			b.ReportAllocs()
+			b.RunParallel(func(pb *testing.PB) {
+				var out string
+				for pb.Next() {
+					out = truncate(limit, input)
+				}
+				_ = out
+			})
+		}
 	}
+	b.Run("Unlimited", run(-1, "hello 😊 world 🌍🚀"))
+	b.Run("Zero", run(0, "Some text"))
+	b.Run("Short", run(10, "Short Text"))
+	b.Run("ASCII", run(5, "Hello, World!"))
+	b.Run("ValidUTF-8", run(10, "hello 😊 world 🌍🚀"))
+	b.Run("InvalidUTF-8", run(6, "€"[0:2]+"hello€€"))
+	b.Run("MixedUTF-8", run(14, "\x80😊\x80 Hello\x80World🌍\x80🚀\x80"))
 }
 
 func BenchmarkWalkAttributes(b *testing.B) {