-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add default config for Gretel GPT with differential privacy (#433)
* Create natural-language-differential-privacy.yml
* Update natural-language-differential-privacy.yml
* Update natural-language-differential-privacy.yml
* Update natural-language-differential-privacy.yml
- Loading branch information
Showing 1 changed file with 40 additions and 0 deletions.
There are no files selected for viewing
40 changes: 40 additions & 0 deletions
40
config_templates/gretel/synthetics/natural-language-differential-privacy.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Natural Language Differentially Private Fine-Tuning Configuration | ||
# | ||
# Purpose: This configuration is designed for fine-tuning language models on | ||
# natural language data (e.g., reviews, tweets, conversations) using | ||
# differential privacy (DP) techniques in Gretel. | ||
# | ||
# Key Points: | ||
# 1. Data and Epochs: For optimal accuracy and utility, it's recommended to use | ||
# 10,000 or more examples when performing DP fine-tuning. A minimum of | ||
# 3 epochs is recommended regardless of dataset size. | ||
# 2. Gradient Accumulation: This technique allows for larger effective batch sizes | ||
# by accumulating gradients over multiple forward/backward passes. In this config, | ||
# the effective batch size is 128 (batch_size * gradient_accumulation_steps). | ||
|
||
# Gretel GPT configuration for differentially private fine-tuning on
# natural-language text. Nesting restored from the flattened diff view:
# every section below `gpt_x` is a child of that single model entry.
schema_version: "1.0"
name: "natural-language-dp"
models:
  - gpt_x:
      # NOTE(review): looks like a placeholder that is swapped for the real
      # training data when the job is submitted — confirm against the caller.
      data_source: "__temp__"
      pretrained_model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Base model to fine-tune
      column_name: null  # Specify column name for data if using multiple columns
      params:
        # Effective batch size = batch_size * gradient_accumulation_steps
        # = 16 * 8 = 128.
        batch_size: 16  # Number of samples used to compute gradient
        gradient_accumulation_steps: 8  # Number of steps to accumulate gradients before updating
        epochs: 3  # Number of times to iterate over the entire dataset
        # NOTE(review): the next three keys are presumably the usual optimizer
        # regularization / learning-rate warmup / schedule settings — confirm
        # exact semantics in the Gretel GPT reference.
        weight_decay: 0.01
        warmup_steps: 100
        lr_scheduler: "linear"
        learning_rate: 0.001  # Initial learning rate for training
        max_tokens: 128  # Increase to allow for longer sequences
      peft_params:
        # NOTE(review): names suggest the LoRA adapter rank and the ratio
        # lora_alpha / lora_r — confirm.
        lora_r: 8
        lora_alpha_over_r: 1
      privacy_params:
        dp: true  # Enable differentially private fine-tuning via DP-SGD
        epsilon: 5  # Privacy budget (lower values = stronger privacy)
        delta: auto  # Probability of privacy leakage (auto-calculated)
      generate:
        num_records: 80  # Number of records to generate
        maximum_text_length: 128  # Maximum length of generated texts in tokens