From 2b0fd487664722fdc02006012d42206ff32cb91e Mon Sep 17 00:00:00 2001 From: Avani-Thakker-Crest <129363704+Avani-Thakker-Crest@users.noreply.github.com> Date: Wed, 21 Jun 2023 20:14:13 +0530 Subject: [PATCH] [DLP] Implemented dlp_deidentify_time_extract sample (#10235) ## Description Implemented dlp_deidentify_time_extract sample. Reference: https://cloud.google.com/dlp/docs/transformations-reference#time-extract Fixes # Note: Before submitting a pull request, please open an issue for discussion if you are not associated with Google. ## Checklist - [x] I have followed [Sample Guidelines from AUTHORING_GUIDE.MD](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md) - [ ] README is updated to include [all relevant information](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md#readme-file) - [x] **Tests** pass: `nox -s py-3.9` (see [Test Environment Setup](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md#test-environment-setup)) - [x] **Lint** pass: `nox -s lint` (see [Test Environment Setup](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md#test-environment-setup)) - [ ] These samples need a new **API enabled** in testing projects to pass (let us know which ones) - [ ] These samples need a new/updated **env vars** in testing projects set to pass (let us know which ones) - [ ] This sample adds a new sample directory, and I updated the [CODEOWNERS file](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/.github/CODEOWNERS) with the codeowners for this sample - [ ] This sample adds a new **Product API**, and I updated the [Blunderbuss issue/PR auto-assigner](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/.github/blunderbuss.yml) with the codeowners for this sample - [x] Please **merge** this PR for me once it is approved --- dlp/snippets/deid.py | 162 
def deidentify_with_time_extract(
    project: str,
    date_fields: List[str],
    input_csv_file: str,
    output_csv_file: str,
) -> None:
    """Uses the Data Loss Prevention API to de-identify dates in a CSV file
    through time-part extraction (keeping only the year).

    Args:
        project: The Google Cloud project id to use as a parent resource.
        date_fields: A list of (date) fields in the CSV file to de-identify
            through time extraction. Example: ['birth_date', 'register_date'].
            Date values in format mm/DD/YYYY are considered as part of this
            sample; any other cell is passed through as a plain string.
        input_csv_file: The path to the CSV file to de-identify. The first row
            of the file must specify column names, and all other rows must
            contain valid values.
        output_csv_file: The output file path to save the time-extracted data.

    Raises:
        ValueError: If the input CSV file is empty (no header row).
    """

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the date field list to Protobuf field-id dictionaries.
    field_ids = [{"name": field} for field in (date_fields or [])]

    # Read the whole CSV into memory (header row + data rows).
    # newline="" is the csv-module-documented way to open CSV files.
    with open(input_csv_file, newline="") as csvfile:
        csv_lines = list(csv.reader(csvfile))

    if not csv_lines:
        # Fail with a clear message instead of an IndexError below.
        raise ValueError(f"Input CSV file is empty: {input_csv_file}")

    # Helper converting one CSV cell to a protobuf-compatible value:
    # cells parsing as mm/DD/YYYY become date values, all others stay strings.
    def map_data(value: str) -> dict:
        try:
            date = datetime.strptime(value, "%m/%d/%Y")
            return {
                "date_value": {
                    "year": date.year,
                    "month": date.month,
                    "day": date.day,
                }
            }
        except ValueError:
            return {"string_value": value}

    # Using the helper, convert CSV rows to protobuf-compatible dictionaries
    # and construct the table item to de-identify.
    table = {
        "headers": [{"name": header} for header in csv_lines[0]],
        "rows": [
            {"values": [map_data(cell) for cell in row]}
            for row in csv_lines[1:]
        ],
    }
    item = {"table": table}

    # Construct the de-identify configuration: extract only the YEAR part
    # of the configured date fields.
    deidentify_config = {
        "record_transformations": {
            "field_transformations": [
                {
                    "primitive_transformation": {
                        "time_part_config": {"part_to_extract": "YEAR"}
                    },
                    "fields": field_ids,
                }
            ]
        }
    }

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Call the API.
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "item": item,
        }
    )

    # Print the result.
    print("Table after de-identification: {}".format(response.item.table))

    # Helper rendering one de-identified cell back to CSV text: string
    # values pass through; date values are re-serialized as mm/DD/YYYY.
    def write_data(data) -> str:
        return data.string_value or "{}/{}/{}".format(
            data.date_value.month,
            data.date_value.day,
            data.date_value.year,
        )

    # Write results to the output CSV file. newline="" prevents the csv
    # writer from emitting blank rows on Windows.
    with open(output_csv_file, "w", newline="") as csvfile:
        write_file = csv.writer(csvfile, delimiter=",")
        write_file.writerow(
            header.name for header in response.item.table.headers
        )
        for row in response.item.table.rows:
            write_file.writerow(write_data(value) for value in row.values)

    # Print status.
    print(f"Successfully saved date-extracted output to {output_csv_file}")
def test_deidentify_with_time_extract(tempdir: TextIO, capsys: pytest.CaptureFixture) -> None:
    """Checks that time-extraction de-identification reports success."""
    out_path = os.path.join(str(tempdir), "year-extracted.csv")

    deid.deidentify_with_time_extract(
        GCLOUD_PROJECT,
        date_fields=DATE_FIELDS,
        input_csv_file=CSV_FILE,
        output_csv_file=out_path,
    )

    captured, _ = capsys.readouterr()
    assert "Successful" in captured