feat: [dlp] Introduce Discovery API protos and methods (#4761)

* feat: Introduce Discovery API protos and methods Add DeidentifyDataSource result summary protos Add protos for nullness and uniqueness, and column data profiles Add SensitivityScore proto to InfoType docs: Update comments for many messages. PiperOrigin-RevId: 576598642 Source-Link: googleapis/googleapis@24813ac Source-Link: googleapis/googleapis-gen@8528bf8 Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLXByaXZhY3ktZGxwLy5Pd2xCb3QueWFtbCIsImgiOiI4NTI4YmY4ZDc2MGM5MjUxYzIxMjI4MWJlMjFiZDAwZjE0NGM5YTY1In0= * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Denis DelGrosso <[email protected]>
googleapis · Oct 26, 2023 · 692e10f · 692e10f
1 parent ea49e4e
commit 692e10f
Show file tree

Hide file tree

Showing 33 changed files with 26,316 additions and 9,509 deletions.
diff --git a/packages/google-privacy-dlp/README.md b/packages/google-privacy-dlp/README.md
diff --git a/packages/google-privacy-dlp/protos/google/privacy/dlp/v2/dlp.proto b/packages/google-privacy-dlp/protos/google/privacy/dlp/v2/dlp.proto
diff --git a/packages/google-privacy-dlp/protos/google/privacy/dlp/v2/storage.proto b/packages/google-privacy-dlp/protos/google/privacy/dlp/v2/storage.proto
@@ -1,4 +1,4 @@
-// Copyright 2022 Google LLC
+// Copyright 2023 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -38,51 +38,73 @@ message InfoType {
 
   // Optional version name for this InfoType.
   string version = 2;
+
+  // Optional custom sensitivity for this InfoType.
+  // This only applies to data profiling.
+  SensitivityScore sensitivity_score = 3;
 }
 
-// Score is a summary of all elements in the data profile.
-// A higher number means more sensitive.
+// Score is calculated from all elements in the data profile.
+// A higher level means the data is more sensitive.
 message SensitivityScore {
-  // Various score levels for resources.
+  // Various sensitivity score levels for resources.
   enum SensitivityScoreLevel {
     // Unused.
     SENSITIVITY_SCORE_UNSPECIFIED = 0;
 
-    // No sensitive information detected. Limited access.
+    // No sensitive information detected. The resource isn't publicly
+    // accessible.
     SENSITIVITY_LOW = 10;
 
-    // Medium risk - PII, potentially sensitive data, or fields with free-text
-    // data that are at higher risk of having intermittent sensitive data.
-    // Consider limiting access.
+    // Medium risk. Contains personally identifiable information (PII),
+    // potentially sensitive data, or fields with free-text data that are at a
+    // higher risk of having intermittent sensitive data. Consider limiting
+    // access.
     SENSITIVITY_MODERATE = 20;
 
-    // High risk – SPII may be present. Exfiltration of data may lead to user
-    // data loss. Re-identification of users may be possible. Consider limiting
-    // usage and or removing SPII.
+    // High risk. Sensitive personally identifiable information (SPII) can be
+    // present. Exfiltration of data can lead to user data loss.
+    // Re-identification of users might be possible. Consider limiting usage
+    // and/or removing SPII.
     SENSITIVITY_HIGH = 30;
   }
 
-  // The score applied to the resource.
+  // The sensitivity score applied to the resource.
   SensitivityScoreLevel score = 1;
 }
 
-// Categorization of results based on how likely they are to represent a match,
-// based on the number of elements they contain which imply a match.
+// Coarse-grained confidence level of how well a particular finding
+// satisfies the criteria to match a particular infoType.
+//
+// Likelihood is calculated based on the number of signals a
+// finding has that imply that the finding matches the infoType. For
+// example, a string that has an '@' and a '.com' is more likely to be a
+// match for an email address than a string that only has an '@'.
+//
+// In general, the highest likelihood level has the strongest signals that
+// indicate a match. That is, a finding with a high likelihood has a low chance
+// of being a false positive.
+//
+// For more information about each likelihood level
+// and how likelihood works, see [Match
+// likelihood](https://cloud.google.com/dlp/docs/likelihood).
 enum Likelihood {
   // Default value; same as POSSIBLE.
   LIKELIHOOD_UNSPECIFIED = 0;
 
-  // Few matching elements.
+  // Highest chance of a false positive.
   VERY_UNLIKELY = 1;
 
+  // High chance of a false positive.
   UNLIKELY = 2;
 
-  // Some matching elements.
+  // Some matching signals. The default value.
   POSSIBLE = 3;
 
+  // Low chance of a false positive.
   LIKELY = 4;
 
-  // Many matching elements.
+  // Confidence level is high. Lowest chance of a false positive.
   VERY_LIKELY = 5;
 }
 
@@ -163,9 +185,7 @@ message CustomInfoType {
   // output. This should be used in conjunction with a field on the
   // transformation such as `surrogate_info_type`. This CustomInfoType does
   // not support the use of `detection_rules`.
-  message SurrogateType {
-
-  }
+  message SurrogateType {}
 
   // Deprecated; use `InspectionRuleSet` instead. Rule for modifying a
   // `CustomInfoType` to alter behavior under certain circumstances, depending
@@ -282,6 +302,13 @@ message CustomInfoType {
   // If set to EXCLUSION_TYPE_EXCLUDE this infoType will not cause a finding
   // to be returned. It still can be used for rules matching.
   ExclusionType exclusion_type = 8;
+
+  // Sensitivity for this CustomInfoType. If this CustomInfoType extends an
+  // existing InfoType, the sensitivity here will take precedence over that of
+  // the original InfoType. If unset for a CustomInfoType, it will default to
+  // HIGH.
+  // This only applies to data profiling.
+  SensitivityScore sensitivity_score = 9;
 }
 
 // General identifier of a data field in a storage service.
@@ -330,7 +357,7 @@ enum FileType {
   // scanning attempts to convert the content of the file to utf_8 to scan
   // the file.
   // If you wish to avoid this fall back, specify one or more of the other
-  // FileType's in your storage scan.
+  // file types in your storage scan.
   BINARY_FILE = 1;
 
   // Included file extensions:
@@ -343,19 +370,24 @@ enum FileType {
   TEXT_FILE = 2;
 
   // Included file extensions:
-  //   bmp, gif, jpg, jpeg, jpe, png.
-  // bytes_limit_per_file has no effect on image files.
-  // Image inspection is restricted to 'global', 'us', 'asia', and 'europe'.
+  //   bmp, gif, jpg, jpeg, jpe, png. Setting
+  // [bytes_limit_per_file][google.privacy.dlp.v2.CloudStorageOptions.bytes_limit_per_file]
+  // or
+  // [bytes_limit_per_file_percent][google.privacy.dlp.v2.CloudStorageOptions.bytes_limit_per_file_percent]
+  // has no effect on image files. Image inspection is restricted to the
+  // `global`, `us`, `asia`, and `europe` regions.
   IMAGE = 3;
 
-  // Word files >30 MB will be scanned as binary files.
+  // Microsoft Word files larger than 30 MB will be scanned as binary files.
   // Included file extensions:
-  //   docx, dotx, docm, dotm
+  //   docx, dotx, docm, dotm. Setting `bytes_limit_per_file` or
+  //   `bytes_limit_per_file_percent` has no effect on Word files.
   WORD = 5;
 
-  // PDF files >30 MB will be scanned as binary files.
+  // PDF files larger than 30 MB will be scanned as binary files.
   // Included file extensions:
-  //   pdf
+  //   pdf. Setting `bytes_limit_per_file` or `bytes_limit_per_file_percent`
+  // has no effect on PDF files.
   PDF = 6;
 
   // Included file extensions:
@@ -370,14 +402,16 @@ enum FileType {
   //   tsv
   TSV = 9;
 
-  // Powerpoint files >30 MB will be scanned as binary files.
-  // Included file extensions:
-  //   pptx, pptm, potx, potm, pot
+  // Microsoft PowerPoint files larger than 30 MB will be scanned as binary
+  // files. Included file extensions:
+  //   pptx, pptm, potx, potm, pot. Setting `bytes_limit_per_file` or
+  //   `bytes_limit_per_file_percent` has no effect on PowerPoint files.
   POWERPOINT = 11;
 
-  // Excel files >30 MB will be scanned as binary files.
+  // Microsoft Excel files larger than 30 MB will be scanned as binary files.
   // Included file extensions:
-  //   xlsx, xlsm, xltx, xltm
+  //   xlsx, xlsm, xltx, xltm. Setting `bytes_limit_per_file` or
+  //   `bytes_limit_per_file_percent` has no effect on Excel files.
   EXCEL = 12;
 }
 
@@ -478,16 +512,22 @@ message CloudStorageOptions {
   FileSet file_set = 1;
 
   // Max number of bytes to scan from a file. If a scanned file's size is bigger
-  // than this value then the rest of the bytes are omitted. Only one
-  // of bytes_limit_per_file and bytes_limit_per_file_percent can be specified.
-  // Cannot be set if de-identification is requested.
+  // than this value then the rest of the bytes are omitted. Only one of
+  // `bytes_limit_per_file` and `bytes_limit_per_file_percent` can be specified.
+  // This field can't be set if de-identification is requested. For certain file
+  // types, setting this field has no effect. For more information, see [Limits
+  // on bytes scanned per
+  // file](https://cloud.google.com/dlp/docs/supported-file-types#max-byte-size-per-file).
   int64 bytes_limit_per_file = 4;
 
   // Max percentage of bytes to scan from a file. The rest are omitted. The
   // number of bytes scanned is rounded down. Must be between 0 and 100,
-  // inclusively. Both 0 and 100 means no limit. Defaults to 0. Only one
-  // of bytes_limit_per_file and bytes_limit_per_file_percent can be specified.
-  // Cannot be set if de-identification is requested.
+  // inclusive. Both 0 and 100 mean no limit. Defaults to 0. Only one of
+  // `bytes_limit_per_file` and `bytes_limit_per_file_percent` can be specified.
+  // This field can't be set if de-identification is requested. For certain file
+  // types, setting this field has no effect. For more information, see [Limits
+  // on bytes scanned per
+  // file](https://cloud.google.com/dlp/docs/supported-file-types#max-byte-size-per-file).
   int32 bytes_limit_per_file_percent = 8;
 
   // List of file type groups to include in the scan.
@@ -565,9 +605,15 @@ message BigQueryOptions {
 
   // References to fields excluded from scanning. This allows you to skip
   // inspection of entire columns which you know have no findings.
+  // When inspecting a table, we recommend that you inspect all columns.
+  // Otherwise, findings might be affected because hints from excluded columns
+  // will not be used.
   repeated FieldId excluded_fields = 5;
 
   // Limit scanning only to these fields.
+  // When inspecting a table, we recommend that you inspect all columns.
+  // Otherwise, findings might be affected because hints from excluded columns
+  // will not be used.
   repeated FieldId included_fields = 7;
 }