From 6a742ffb16da991a0bf2df37218dc40af5ee0011 Mon Sep 17 00:00:00 2001 From: shollyman Date: Tue, 14 Jun 2022 12:01:56 -0700 Subject: [PATCH] feat(bigquery/storage/managedwriter): improve proto3 normalization (#6082) * feat(bigquery/storage/managedwriter): improve proto3 normalization This PR does mildly alarming things in the service of enabling proto3 support for users. NormalizeDescriptor will now handle altering a proto3 descriptor to an equivalent behavior using proto2. In the case of proto3 where field presence isn't used (e.g. no optional keyword), normalization uses the ability in proto2 to set optional defaults to mimic the proto3 behavior where scalar types are the default type value (0 for numerics, empty string for strings/bytes, and first defined enum value for enums). This doesn't, however, completely enable proto3 support. We still need to enable the backend to deal properly with the well-known wrapper types, which is also work in flight with the backend. --- .../managedwriter/adapt/protoconversion.go | 33 ++++ .../adapt/protoconversion_test.go | 183 ++++++++++++++++-- .../storage/managedwriter/validation_test.go | 46 +++-- 3 files changed, 223 insertions(+), 39 deletions(-) diff --git a/bigquery/storage/managedwriter/adapt/protoconversion.go b/bigquery/storage/managedwriter/adapt/protoconversion.go index ede2fa851109..50817d7651a8 100644 --- a/bigquery/storage/managedwriter/adapt/protoconversion.go +++ b/bigquery/storage/managedwriter/adapt/protoconversion.go @@ -293,6 +293,9 @@ func tableFieldSchemaToFieldDescriptorProto(field *storagepb.TableFieldSchema, i // In addition to nesting messages, this method also handles some encapsulation of enum types to avoid possible // conflicts due to ambiguities, and clears oneof indices as oneof isn't a concept that maps into BigQuery // schemas. +// +// To enable proto3 usage, this function will also rewrite proto3 descriptors into equivalent proto2 form. +// Such rewrites include setting the appropriate default values for proto3 fields. func NormalizeDescriptor(in protoreflect.MessageDescriptor) (*descriptorpb.DescriptorProto, error) { return normalizeDescriptorInternal(in, newStringSet(), newStringSet(), newStringSet(), nil) } @@ -311,6 +314,36 @@ func normalizeDescriptorInternal(in protoreflect.MessageDescriptor, visitedTypes for i := 0; i < in.Fields().Len(); i++ { inField := in.Fields().Get(i) resultFDP := protodesc.ToFieldDescriptorProto(inField) + // For proto3 messages without presence, use proto2 default values to match proto3 + // behavior in default values. + if inField.Syntax() == protoreflect.Proto3 && inField.Cardinality() != protoreflect.Repeated { + // Only set default value if there's no field presence. + if resultFDP.Proto3Optional == nil || !resultFDP.GetProto3Optional() { + switch resultFDP.GetType() { + case descriptorpb.FieldDescriptorProto_TYPE_BOOL: + resultFDP.DefaultValue = proto.String("false") + case descriptorpb.FieldDescriptorProto_TYPE_BYTES, descriptorpb.FieldDescriptorProto_TYPE_STRING: + resultFDP.DefaultValue = proto.String("") + case descriptorpb.FieldDescriptorProto_TYPE_ENUM: + // Resolve the proto3 default value. The default value should be the value name. + defValue := inField.Enum().Values().ByNumber(inField.Default().Enum()) + resultFDP.DefaultValue = proto.String(string(defValue.Name())) + case descriptorpb.FieldDescriptorProto_TYPE_DOUBLE, + descriptorpb.FieldDescriptorProto_TYPE_FLOAT, + descriptorpb.FieldDescriptorProto_TYPE_INT64, + descriptorpb.FieldDescriptorProto_TYPE_UINT64, + descriptorpb.FieldDescriptorProto_TYPE_INT32, + descriptorpb.FieldDescriptorProto_TYPE_FIXED64, + descriptorpb.FieldDescriptorProto_TYPE_FIXED32, + descriptorpb.FieldDescriptorProto_TYPE_UINT32, + descriptorpb.FieldDescriptorProto_TYPE_SFIXED32, + descriptorpb.FieldDescriptorProto_TYPE_SFIXED64, + descriptorpb.FieldDescriptorProto_TYPE_SINT32, + descriptorpb.FieldDescriptorProto_TYPE_SINT64: + resultFDP.DefaultValue = proto.String("0") + } + } + } // Clear proto3 optional annotation, as the backend converter can // treat this as a proto2 optional. if resultFDP.Proto3Optional != nil { diff --git a/bigquery/storage/managedwriter/adapt/protoconversion_test.go b/bigquery/storage/managedwriter/adapt/protoconversion_test.go index 1e5c3d647fa0..6737fa62e52a 100644 --- a/bigquery/storage/managedwriter/adapt/protoconversion_test.go +++ b/bigquery/storage/managedwriter/adapt/protoconversion_test.go @@ -568,11 +568,12 @@ func TestNormalizeDescriptor(t *testing.T) { Name: proto.String("google_protobuf_Int64Value"), Field: []*descriptorpb.FieldDescriptorProto{ { - Name: proto.String("value"), - JsonName: proto.String("value"), - Number: proto.Int32(1), - Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(), - Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + Name: proto.String("value"), + JsonName: proto.String("value"), + Number: proto.Int32(1), + DefaultValue: proto.String("0"), + Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), }, }, }, @@ -580,11 +581,12 @@ func TestNormalizeDescriptor(t *testing.T) { Name: proto.String("google_protobuf_StringValue"), Field: []*descriptorpb.FieldDescriptorProto{ { - Name: proto.String("value"), - JsonName: proto.String("value"), - Number: proto.Int32(1), - Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(), - Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + Name: proto.String("value"), + JsonName: proto.String("value"), + Number: proto.Int32(1), + DefaultValue: proto.String(""), + Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), }, }, }, @@ -628,11 +630,12 @@ func TestNormalizeDescriptor(t *testing.T) { Name: proto.String("testdata_SimpleMessageProto3WithOptional"), Field: []*descriptorpb.FieldDescriptorProto{ { - Name: proto.String("name"), - JsonName: proto.String("name"), - Number: proto.Int32(1), - Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(), - Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + Name: proto.String("name"), + JsonName: proto.String("name"), + Number: proto.Int32(1), + DefaultValue: proto.String(""), + Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), }, { Name: proto.String("value"), @@ -644,6 +647,156 @@ func TestNormalizeDescriptor(t *testing.T) { }, }, }, + { + description: "WithProto3Defaults", + in: (&testdata.ValidationP3Defaults{}).ProtoReflect().Descriptor(), + want: &descriptorpb.DescriptorProto{ + Name: proto.String("testdata_ValidationP3Defaults"), + Field: []*descriptorpb.FieldDescriptorProto{ + { + Name: proto.String("double_field"), + JsonName: proto.String("doubleField"), + Number: proto.Int32(1), + Type: descriptorpb.FieldDescriptorProto_TYPE_DOUBLE.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("float_field"), + JsonName: proto.String("floatField"), + Number: proto.Int32(2), + Type: descriptorpb.FieldDescriptorProto_TYPE_FLOAT.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("int32_field"), + JsonName: proto.String("int32Field"), + Number: proto.Int32(3), + Type: descriptorpb.FieldDescriptorProto_TYPE_INT32.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("int64_field"), + JsonName: proto.String("int64Field"), + Number: proto.Int32(4), + Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("uint32_field"), + JsonName: proto.String("uint32Field"), + Number: proto.Int32(5), + Type: descriptorpb.FieldDescriptorProto_TYPE_UINT32.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("sint32_field"), + JsonName: proto.String("sint32Field"), + Number: proto.Int32(7), + Type: descriptorpb.FieldDescriptorProto_TYPE_SINT32.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("sint64_field"), + JsonName: proto.String("sint64Field"), + Number: proto.Int32(8), + Type: descriptorpb.FieldDescriptorProto_TYPE_SINT64.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("fixed32_field"), + JsonName: proto.String("fixed32Field"), + Number: proto.Int32(9), + Type: descriptorpb.FieldDescriptorProto_TYPE_FIXED32.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("sfixed32_field"), + JsonName: proto.String("sfixed32Field"), + Number: proto.Int32(11), + Type: descriptorpb.FieldDescriptorProto_TYPE_SFIXED32.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("sfixed64_field"), + JsonName: proto.String("sfixed64Field"), + Number: proto.Int32(12), + Type: descriptorpb.FieldDescriptorProto_TYPE_SFIXED64.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("0"), + }, + { + Name: proto.String("bool_field"), + JsonName: proto.String("boolField"), + Number: proto.Int32(13), + Type: descriptorpb.FieldDescriptorProto_TYPE_BOOL.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("false"), + }, + { + Name: proto.String("string_field"), + JsonName: proto.String("stringField"), + Number: proto.Int32(14), + Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String(""), + }, + { + Name: proto.String("bytes_field"), + JsonName: proto.String("bytesField"), + Number: proto.Int32(15), + Type: descriptorpb.FieldDescriptorProto_TYPE_BYTES.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String(""), + }, + { + Name: proto.String("enum_field"), + JsonName: proto.String("enumField"), + Number: proto.Int32(16), + Type: descriptorpb.FieldDescriptorProto_TYPE_ENUM.Enum(), + TypeName: proto.String("testdata_Proto3ExampleEnum_E.Proto3ExampleEnum"), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(), + DefaultValue: proto.String("P3_UNDEFINED"), + }, + }, + NestedType: []*descriptorpb.DescriptorProto{ + { + Name: proto.String("testdata_Proto3ExampleEnum_E"), + EnumType: []*descriptorpb.EnumDescriptorProto{ + { + Name: proto.String("Proto3ExampleEnum"), + Value: []*descriptorpb.EnumValueDescriptorProto{ + { + Name: proto.String("P3_UNDEFINED"), + Number: proto.Int32(0), + }, + { + Name: proto.String("P3_THING"), + Number: proto.Int32(1), + }, + { + Name: proto.String("P3_OTHER_THING"), + Number: proto.Int32(2), + }, + { + Name: proto.String("P3_THIRD_THING"), + Number: proto.Int32(3), + }, + }, + }, + }, + }, + }, + }, + }, { description: "WithExternalEnum", in: (&testdata.ExternalEnumMessage{}).ProtoReflect().Descriptor(), diff --git a/bigquery/storage/managedwriter/validation_test.go b/bigquery/storage/managedwriter/validation_test.go index c1958916997e..640e57be4b83 100644 --- a/bigquery/storage/managedwriter/validation_test.go +++ b/bigquery/storage/managedwriter/validation_test.go @@ -151,31 +151,29 @@ func TestValidation_Values(t *testing.T) { withIntegerValueCount("enum_field", int64(testdata.Proto2ExampleEnum_P2_OTHER_THING), 1), }, }, - /* - BACKEND BEHAVIOR CURRENTLY INCORRECT for proto3 non-presence defaults. - { - description: "proto3 default values", - tableSchema: testdata.ValidationBaseSchema, - inputRow: &testdata.ValidationP3Defaults{}, - constraints: []constraintOption{ - withExactRowCount(1), - withFloatValueCount("double_field", 0, 1), - withFloatValueCount("float_field", 0, 1), - withIntegerValueCount("int32_field", 0, 1), - withIntegerValueCount("int64_field", 0, 1), - withIntegerValueCount("uint32_field", 0, 1), - withIntegerValueCount("sint32_field", 0, 1), - withIntegerValueCount("sint64_field", 0, 1), - withIntegerValueCount("fixed32_field", 0, 1), - withIntegerValueCount("sfixed32_field", 0, 1), - withIntegerValueCount("sfixed64_field", 0, 1), - withBoolValueCount("bool_field", false, 1), - withStringValueCount("string_field", "", 1), - withBytesValueCount("bytes_field", []byte(""), 1), - withIntegerValueCount("enum_field", int64(0), 1), - }, + + { + description: "proto3 default values", + tableSchema: testdata.ValidationBaseSchema, + inputRow: &testdata.ValidationP3Defaults{}, + constraints: []constraintOption{ + withExactRowCount(1), + withFloatValueCount("double_field", 0, 1), + withFloatValueCount("float_field", 0, 1), + withIntegerValueCount("int32_field", 0, 1), + withIntegerValueCount("int64_field", 0, 1), + withIntegerValueCount("uint32_field", 0, 1), + withIntegerValueCount("sint32_field", 0, 1), + withIntegerValueCount("sint64_field", 0, 1), + withIntegerValueCount("fixed32_field", 0, 1), + withIntegerValueCount("sfixed32_field", 0, 1), + withIntegerValueCount("sfixed64_field", 0, 1), + withBoolValueCount("bool_field", false, 1), + withStringValueCount("string_field", "", 1), + withBytesValueCount("bytes_field", []byte(""), 1), + withIntegerValueCount("enum_field", int64(0), 1), }, - */ + }, /* BACKEND BEHAVIOR FOR WRAPPER TYPES CURRENTLY INCORRECT {