Add default config for Gretel GPT with differential privacy (#433) (#434)

* Create natural-language-differential-privacy.yml
* Update natural-language-differential-privacy.yml
* Update natural-language-differential-privacy.yml
* Update natural-language-differential-privacy.yml

Co-authored-by: Alexander Watson <[email protected]>
gretelai · Jul 22, 2024 · 54eef18 · 54eef18
1 parent 31756df
commit 54eef18
Showing 1 changed file with 40 additions and 0 deletions.
diff --git a/config_templates/gretel/synthetics/natural-language-differential-privacy.yml b/config_templates/gretel/synthetics/natural-language-differential-privacy.yml
@@ -0,0 +1,40 @@
+# Natural Language Differentially Private Fine-Tuning Configuration
+# 
+# Purpose: This configuration is designed for fine-tuning language models on 
+# natural language data (e.g., reviews, tweets, conversations) using 
+# differential privacy (DP) techniques in Gretel.
+#
+# Key Points:
+# 1. Data and Epochs: For optimal accuracy and utility, it's recommended to use 
+#    10,000 or more examples when performing DP fine-tuning. A minimum of 
+#    3 epochs is recommended regardless of dataset size. 
+# 2. Gradient Accumulation: This technique allows for larger effective batch sizes
+#    by accumulating gradients over multiple forward/backward passes. In this config,
+#    the effective batch size is 128 (batch_size 16 * gradient_accumulation_steps 8).
+
+schema_version: "1.0"
+name: "natural-language-dp"
+models:
+  - gpt_x:
+      data_source: "__temp__"  # NOTE(review): looks like a placeholder for the real training data source - confirm
+      pretrained_model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Base language model to fine-tune
+      column_name: null  # Specify column name for data if using multiple columns
+      params:  # Fine-tuning hyperparameters
+        batch_size: 16  # Samples per forward/backward pass (effective batch = 16 * 8 = 128)
+        gradient_accumulation_steps: 8  # Number of steps to accumulate gradients before updating
+        epochs: 3  # Number of times to iterate over the entire dataset
+        weight_decay: 0.01  # Presumably the optimizer's weight-decay coefficient - TODO confirm
+        warmup_steps: 100  # Presumably the number of learning-rate warmup steps - TODO confirm
+        lr_scheduler: "linear"  # Learning-rate schedule type
+        learning_rate: 0.001  # Initial learning rate for training
+        max_tokens: 128  # Increase to allow for longer sequences
+      peft_params:  # Presumably parameter-efficient fine-tuning (LoRA) settings - TODO confirm
+        lora_r: 8  # Presumably the LoRA rank - TODO confirm
+        lora_alpha_over_r: 1  # Presumably the ratio of LoRA alpha to lora_r - TODO confirm
+      privacy_params:  # Differential-privacy settings
+        dp: true  # Enable differentially private fine-tuning via DP-SGD
+        epsilon: 5  # Privacy budget (lower values = stronger privacy)
+        delta: auto  # Probability of privacy leakage (auto-calculated)
+      generate:  # Text generation settings
+        num_records: 80  # Number of records to generate
+        maximum_text_length: 128  # Maximum length of generated texts in tokens