Skip to content

Commit

Permalink
proto: reject invalid UTF-8 in strings (protocolbuffers#499)
Browse files Browse the repository at this point in the history
The proto2 and proto3 specifications explicitly say that strings must be
composed of valid UTF-8 characters. 

The C++ implementation prints an error anytime strings with invalid UTF-8
is present during parsing or serializing. For proto3, it explicitly
errors out when parsing an invalid string.

Thus, we cause marshal/unmarshal to error if any string fields are invalid.
The error returned is fail-fast and indistinguishable. This means that the
we stop the unmarshal process the moment we detect an invalid string,
as opposed to finishing the unmarshal. An indistinguishable error means that
we provide no API for the user to programmatically check specifically for
invalid UTF-8 errors so that they can ignore it.

In conversations with the protobuf team, they felt strongly that there should be
no ability for users to ignore the UTF-8 validation error. However, if this change is
overly problematic for users, we can consider a workaround for users already
depending on strings containing invalid UTF-8.
  • Loading branch information
dsnet authored Jan 26, 2018
1 parent 10c2d9d commit 3525335
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 0 deletions.
15 changes: 15 additions & 0 deletions proto/all_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2184,6 +2184,21 @@ func TestConcurrentMarshal(t *testing.T) {
}
}

func TestInvalidUTF8(t *testing.T) {
const wire = "\x12\x04\xde\xea\xca\xfe"

var m GoTest
if err := Unmarshal([]byte(wire), &m); err == nil {
t.Errorf("Unmarshal error: got nil, want non-nil")
}

m.Reset()
m.Table = String(wire[2:])
if _, err := Marshal(&m); err == nil {
t.Errorf("Marshal error: got nil, want non-nil")
}
}

// Benchmarks

func testMsg() *GoTest {
Expand Down
3 changes: 3 additions & 0 deletions proto/lib.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ package proto

import (
"encoding/json"
"errors"
"fmt"
"log"
"reflect"
Expand All @@ -279,6 +280,8 @@ import (
// This variable is temporary and will go away soon. Do not rely on it.
var Proto3UnknownFields = false

var errInvalidUTF8 = errors.New("proto: invalid UTF-8 string")

// Message is implemented by generated protocol buffer messages.
type Message interface {
Reset()
Expand Down
13 changes: 13 additions & 0 deletions proto/table_marshal.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"strings"
"sync"
"sync/atomic"
"unicode/utf8"
)

// a sizer takes a pointer to a field and the size of its tag, computes the size of
Expand Down Expand Up @@ -1983,6 +1984,9 @@ func appendBoolPackedSlice(b []byte, ptr pointer, wiretag uint64, _ bool) ([]byt
}
func appendStringValue(b []byte, ptr pointer, wiretag uint64, _ bool) ([]byte, error) {
v := *ptr.toString()
if !utf8.ValidString(v) {
return nil, errInvalidUTF8
}
b = appendVarint(b, wiretag)
b = appendVarint(b, uint64(len(v)))
b = append(b, v...)
Expand All @@ -1993,6 +1997,9 @@ func appendStringValueNoZero(b []byte, ptr pointer, wiretag uint64, _ bool) ([]b
if v == "" {
return b, nil
}
if !utf8.ValidString(v) {
return nil, errInvalidUTF8
}
b = appendVarint(b, wiretag)
b = appendVarint(b, uint64(len(v)))
b = append(b, v...)
Expand All @@ -2004,6 +2011,9 @@ func appendStringPtr(b []byte, ptr pointer, wiretag uint64, _ bool) ([]byte, err
return b, nil
}
v := *p
if !utf8.ValidString(v) {
return nil, errInvalidUTF8
}
b = appendVarint(b, wiretag)
b = appendVarint(b, uint64(len(v)))
b = append(b, v...)
Expand All @@ -2012,6 +2022,9 @@ func appendStringPtr(b []byte, ptr pointer, wiretag uint64, _ bool) ([]byte, err
func appendStringSlice(b []byte, ptr pointer, wiretag uint64, _ bool) ([]byte, error) {
s := *ptr.toStringSlice()
for _, v := range s {
if !utf8.ValidString(v) {
return nil, errInvalidUTF8
}
b = appendVarint(b, wiretag)
b = appendVarint(b, uint64(len(v)))
b = append(b, v...)
Expand Down
10 changes: 10 additions & 0 deletions proto/table_unmarshal.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"strings"
"sync"
"sync/atomic"
"unicode/utf8"
)

// Unmarshal is the entry point from the generated .pb.go files.
Expand Down Expand Up @@ -1514,6 +1515,9 @@ func unmarshalStringValue(b []byte, f pointer, w int) ([]byte, error) {
return nil, io.ErrUnexpectedEOF
}
v := string(b[:x])
if !utf8.ValidString(v) {
return nil, errInvalidUTF8
}
*f.toString() = v
return b[x:], nil
}
Expand All @@ -1531,6 +1535,9 @@ func unmarshalStringPtr(b []byte, f pointer, w int) ([]byte, error) {
return nil, io.ErrUnexpectedEOF
}
v := string(b[:x])
if !utf8.ValidString(v) {
return nil, errInvalidUTF8
}
*f.toStringPtr() = &v
return b[x:], nil
}
Expand All @@ -1548,6 +1555,9 @@ func unmarshalStringSlice(b []byte, f pointer, w int) ([]byte, error) {
return nil, io.ErrUnexpectedEOF
}
v := string(b[:x])
if !utf8.ValidString(v) {
return nil, errInvalidUTF8
}
s := f.toStringSlice()
*s = append(*s, v)
return b[x:], nil
Expand Down

0 comments on commit 3525335

Please sign in to comment.