Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updates to transform blueprint (#467) - main #471

Merged
merged 1 commit into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 29 additions & 13 deletions config_templates/gretel/transform/default.yml
Original file line number Diff line number Diff line change
@@ -1,31 +1,47 @@
# Policy to search for and redact "sensitive PII" as defined by `ask-experian` as well as
# a custom defined regular expression for user IDs.
# https://www.experian.com/blogs/ask-experian/what-is-personally-identifiable-information/

schema_version: "1.0"
name: "redact-pii-nlp"
name: "redact-pii-ner"
models:
- transform_v2:
data_source: "_"
globals:
classify:
# Classification currently uses the Gretel Cloud. If you are running in a hybrid
# environment and prefer not to use the Cloud, please set "enable: false" below.
enable: true
entities:
- first_name
- last_name
- name
- gender
- email
- phone_number
- street_address
- city
- administrative_unit
- country
- address
- postcode
- credit_card_number
- ssn
ner_threshold: 0.2
locales: [en_US, en_CA]
ner:
ner_threshold: 0.7
locales: [en_US]
steps:
- rows:
- vars:
row_seed: random.random()
rows:
update:
- condition: column.entity is in globals.classify.entities
- condition: column.entity is in globals.classify.entities and column.entity != "country" and column.entity != "email" and column.entity != "phone_number" and column.entity != "first_name" and column.entity != "last_name" and column.entity != "gender"
value: column.entity | fake
fallback_value: '"<" ~ column.entity ~ ">"'
- type: text
value: this | fake_entities
- condition: column.entity == "phone_number"
value: (random.randint(100, 999) | string) + "-" + (random.randint(100, 999) |
string) + "-" + (random.randint(1000, 9999) | string)
- condition: column.entity == "first_name"
value: fake.persona(row_index=vars.row_seed + index).first_name
- condition: column.entity == "last_name"
value: fake.persona(row_index=vars.row_seed + index).last_name
- condition: column.entity == "gender"
value: fake.persona(row_index=vars.row_seed + index).gender
- condition: column.entity == "email"
value: fake.persona(row_index=vars.row_seed + index).email
- condition: column.entity is none and column.type == "text"
value: this | fake_entities
173 changes: 173 additions & 0 deletions config_templates/gretel/transform/transform_example.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
schema_version: "1.0"
name: "redact-pii-ner"
models:
- transform_v2:
data_source: "_"
globals:
classify:
enable: true
entities:
# The model has been fine-tuned on the entities
# listed below, but you can include any arbitrary
# value and the model will attempt to find it.
# See here for definitions of each entity:
# https://docs.gretel.ai/create-synthetic-data/models/transform/v2/supported-entities

# If you want to fake an entity,
# it must be included in Faker:
# https://faker.readthedocs.io/en/master/providers.html

# You generally want to keep the entity list
# to a minimum, only including entities that you
# need to transform, in order to avoid the model getting
# confused about which entity type a column may be.
# Comment entities in or out based on what exists
# in your dataset.

# If the names are combined into a single column
# for full name in your dataset, use the name entity
# instead of first_name and last_name.
- first_name
- last_name
# - name

# If the address is in a single column rather than
# separated out into street address, city, state, etc.,
# use only address as the entity instead,
# and comment the others out.
- street_address
- city
- administrative_unit # Faker's term for state or province
- country
- postcode
# - address

# Other common entities
- gender
- email
- phone_number
- credit_card_number
- ssn

# Entities that the model has been fine-tuned on,
# but are less common. Hence they have been commented
# out by default.
# - account_number
# - api_key
# - bank_routing_number
# - biometric_identifier
# - certificate_license_number
# - company_name
# - coordinate
# - customer_id
# - cvv
# - date
# - date_of_birth
# - date_time
# - device_identifier
# - employee_id
# - health_plan_beneficiary_number
# - ipv4
# - ipv6
# - license_plate
# - medical_record_number
# - national_id
# - password
# - pin
# - state
# - swift_bic
# - unique_identifier
# - tax_id
# - time
# - url
# - user_name
# - vehicle_identifier

ner:
# You can think of the NER threshold as the level of
# confidence required in the model's detection before
# labeling an entity. Increasing the NER threshold
# decreases the number of detected entities, while
# decreasing the NER threshold increases the number
# of detected entities.
ner_threshold: 0.7

# You can add additional locales to the list, separated
# by commas, such as locales: [en_US, en_CA]
locales: [en_US]
steps:
- rows:
update:
# For each column in the dataset you want to fake,
# follow this format:
# - name: <column_name>
# value: fake.<entity_type>()
- name: address
value: fake.street_address()
- name: city
value: fake.city()
- name: state
value: fake.administrative_unit()
- name: postcode
value: fake.postcode()

# Names can be faked the same way:
- name: fname
value: fake.first_name()
- name: lname
value: fake.last_name()
# - name: fullname
# value: fake.name()

# You may want names to be based on a gender column instead.
# Update the name of the gender column (e.g., "gender").
# Update the values in the gender column (e.g., "male", "female").
# - name: fname
# value: fake.first_name_male() if row["gender"] == 'male' else fake.first_name_female() if row["gender"] == 'female' else fake.first_name()
# - name: lname
# value: fake.last_name_male() if row["gender"] == 'male' else fake.last_name_female() if row["gender"] == 'female' else fake.last_name()
# Or, for full name:
# - name: name
# value: fake.name_male() if row["gender"] == 'male' else fake.name_female() if row["gender"] == 'female' else fake.name()

# You may have values based on other values in the
# dataset, such as email.
# Ensure steps for dependent values (e.g. email)
# are performed after the steps that fake the values they
# depend on (e.g. first_name and last_name).
# For example, if I want email to be based on first
# and last name, I need to have faked those already.

# The below syntax generates an email of the form
# <lowercase_first_letter_of_first_name><lowercase_last_name><number between 0 and 9>@<freedomain>
# As an example, it could be "kjohnson7@gmail.com" for someone with a faked name of Kara Johnson
# Be sure to update the column names with your column names,
# rather than "fname" and "lname"
- name: email
value: row["fname"][0].lower() + row["lname"].lower() + (random.randint(0, 9) | string) + "@" + fake.free_email_domain()

# This section of the Faker documentation has a list
# of various options for domains or full emails:
# https://faker.readthedocs.io/en/master/providers/faker.providers.internet.html
# Here are some examples:
# value: fake.email() # Note that this will not be based on first or last name columns, it is random.
# value: fake.company_email() # Note that this will not be based on first or last name columns, it is random.
# value: row["fname"] + "." + row["lname"] + "@" + fake.domain_name()
# value: row["fname"] + "." + row["lname"] + "@" + fake.domain_word() + ".com"
# The next example generates a fake company name, removes punctuation,
# and converts to lowercase for the names and domain.
# value: row["fname"].lower() + "." + row["lname"].lower() + "@" + fake.company().replace(" ", "").replace(",","").replace("-","").lower() + ".org"

# By default, Faker does not standardize telephone formats.
# This example generates a format like "123-456-7890".
- condition: column.entity == "phone_number"
value: (random.randint(100, 999) | string) + "-" + (random.randint(100, 999) | string) + "-" + (random.randint(1000, 9999) | string)
# The next example generates a format like "(123)456-7890"
# - condition: column.entity == "phone_number"
# value: "(" + (random.randint(100, 999) | string) + ")" + (random.randint(100, 999) | string) + "-" + (random.randint(1000, 9999) | string)

# The next section takes text columns not classified as a single entity and runs NER on them.
# It fakes any entities from the list in globals.classify.entities.
# Comment this out if you don't want to fake entities in free-text columns.
- condition: column.entity is none and column.type == "text"
value: this | fake_entities
42 changes: 29 additions & 13 deletions config_templates/gretel/transform/transform_v2.yml
Original file line number Diff line number Diff line change
@@ -1,31 +1,47 @@
# Policy to search for and redact "sensitive PII" as defined by `ask-experian` as well as
# a custom defined regular expression for user IDs.
# https://www.experian.com/blogs/ask-experian/what-is-personally-identifiable-information/

schema_version: "1.0"
name: "redact-pii-nlp"
name: "redact-pii-ner"
models:
- transform_v2:
data_source: "_"
globals:
classify:
# Classification currently uses the Gretel Cloud. If you are running in a hybrid
# environment and prefer not to use the Cloud, please set "enable: false" below.
enable: true
entities:
- first_name
- last_name
- name
- gender
- email
- phone_number
- street_address
- city
- administrative_unit
- country
- address
- postcode
- credit_card_number
- ssn
ner_threshold: 0.2
locales: [en_US, en_CA]
ner:
ner_threshold: 0.7
locales: [en_US]
steps:
- rows:
- vars:
row_seed: random.random()
rows:
update:
- condition: column.entity is in globals.classify.entities
- condition: column.entity is in globals.classify.entities and column.entity != "country" and column.entity != "email" and column.entity != "phone_number" and column.entity != "first_name" and column.entity != "last_name" and column.entity != "gender"
value: column.entity | fake
fallback_value: '"<" ~ column.entity ~ ">"'
- type: text
value: this | fake_entities
- condition: column.entity == "phone_number"
value: (random.randint(100, 999) | string) + "-" + (random.randint(100, 999) |
string) + "-" + (random.randint(1000, 9999) | string)
- condition: column.entity == "first_name"
value: fake.persona(row_index=vars.row_seed + index).first_name
- condition: column.entity == "last_name"
value: fake.persona(row_index=vars.row_seed + index).last_name
- condition: column.entity == "gender"
value: fake.persona(row_index=vars.row_seed + index).gender
- condition: column.entity == "email"
value: fake.persona(row_index=vars.row_seed + index).email
- condition: column.entity is none and column.type == "text"
value: this | fake_entities
8 changes: 4 additions & 4 deletions model_types/modelTypesList.json
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,12 @@
"description": "Flexible data pre and post processing toolkit including support for detecting arbitrary PII entities, configurable data generation templates, and faster speed.",
"label": "Transform V2",
"sampleDataset": {
"fileName": "patients.csv",
"fileName": "patients_notes.csv",
"description": "This patient dataset contains names, addresses and other personally identifiable information, which needs to be redacted before the dataset can be shared or used to train ML models.",
"records": 36,
"fields": 17,
"records": 35,
"fields": 19,
"trainingTime": "< 2 mins",
"bytes": 5647
"bytes": 20646
}
},
{
Expand Down
Loading
Loading