diff --git a/src/google/protobuf/compiler/parser_unittest.cc b/src/google/protobuf/compiler/parser_unittest.cc
index ddf34bfa4523..cc6f1efb218d 100644
--- a/src/google/protobuf/compiler/parser_unittest.cc
+++ b/src/google/protobuf/compiler/parser_unittest.cc
@@ -229,6 +229,38 @@ TEST_F(ParserTest, WarnIfSyntaxIdentifierOmmitted) {
 
 typedef ParserTest ParseMessageTest;
 
+// A leading UTF-8 byte order mark (0xEF 0xBB 0xBF) must be skipped so that
+// the rest of the file parses as an ordinary message definition.
+TEST_F(ParseMessageTest, IgnoreBOM) {
+  // The BOM is written as escape bytes in its own literal, so the test does
+  // not rely on overwriting placeholder characters and no following character
+  // can be read as part of a hex escape.
+  const char input[] = "\xEF\xBB\xBF"
+      "message TestMessage {\n"
+      " required int32 foo = 1;\n"
+      "}\n";
+  ExpectParsesTo(input,
+    "message_type {"
+    " name: \"TestMessage\""
+    " field { name:\"foo\" label:LABEL_REQUIRED type:TYPE_INT32 number:1 }"
+    "}");
+}
+
+// A file that starts with 0xEF but does not continue with 0xBB 0xBF must be
+// reported as an error rather than being treated as a BOM.
+TEST_F(ParseMessageTest, BOMError) {
+  // Only the first BOM byte is present; the proto text follows it
+  // immediately.
+  const char input[] = "\xEF"
+      "message TestMessage {\n"
+      " required int32 foo = 1;\n"
+      "}\n";
+  ExpectHasErrors(input,
+    "0:1: Proto file starts with 0xEF but not UTF-8 BOM. "
+    "Only UTF-8 is accepted for proto file.\n"
+    "0:0: Expected top-level statement (e.g. \"message\").\n");
+}
+
 TEST_F(ParseMessageTest, SimpleMessage) {
   ExpectParsesTo(
     "message TestMessage {\n"
diff --git a/src/google/protobuf/io/tokenizer.cc b/src/google/protobuf/io/tokenizer.cc
index ef2de300bfd8..60bd7957d4a1 100644
--- a/src/google/protobuf/io/tokenizer.cc
+++ b/src/google/protobuf/io/tokenizer.cc
@@ -762,6 +762,17 @@ bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                              next_leading_comments);
 
   if (current_.type == TYPE_START) {
+    // Skip a Unicode byte order mark (BOM) if the file begins with one.
+    // Only the UTF-8 BOM (0xEF 0xBB 0xBF) is accepted; a leading 0xEF that
+    // is not followed by 0xBB 0xBF is reported as an error.
+    if (TryConsume(static_cast<char>(0xEF))) {
+      if (!TryConsume(static_cast<char>(0xBB)) ||
+          !TryConsume(static_cast<char>(0xBF))) {
+        AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
+                 "Only UTF-8 is accepted for proto file.");
+        return false;
+      }
+    }
     collector.DetachFromPrev();
   } else {
     // A comment appearing on the same line must be attached to the previous