Commit
scrap->validate_tokenizer. Remove profiler stuff. Remove unnecessary fields from config files
KevinMusgrave committed Feb 21, 2024
1 parent 7696c20 commit f700eda
Showing 5 changed files with 6 additions and 548 deletions.
15 changes: 3 additions & 12 deletions blog/llm-finetuning/deepspeed.yaml
@@ -1,19 +1,11 @@
-name: mistral deepspeed easy WarmupDecayLR batch size 1 bf16, genai image
+name: mistral deepspeed easy
 debug: false
-workspace: agnieszka
-project: llm-blog2
-profiling:
-  enabled: true
-  begin_on_batch: 100
-  end_after_batch: 1000
 environment:
   environment_variables:
     - NCCL_DEBUG=INFO
-    - HF_HOME=/nvmefs1/agnieszka.ciborowska/hf_cache
   image: determinedai/genai-train:latest
 resources:
   slots_per_trial: 2
-  resource_pool: A100
 searcher:
   name: single
   max_length:
@@ -22,13 +14,12 @@ searcher:
   smaller_is_better: false
 hyperparameters:
   model: "mistralai/Mistral-7B-Instruct-v0.2"
-  #model: "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
   dataset_subset: "easy"
   lora: false
   training_args:
     output_dir: "/tmp/llm_finetuning"
     max_steps: 5000
-    per_device_train_batch_size: 1
+    per_device_train_batch_size: 2
     per_device_eval_batch_size: 4
     bf16: true
     evaluation_strategy: "steps"
@@ -41,5 +32,5 @@ hyperparameters:
     deepspeed: "ds_configs/ds_config_stage_3.json"
 entrypoint: >-
   python -m determined.launch.deepspeed
-  python finetune_with_profiling.py
+  python finetune.py
 max_restarts: 0
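
The hyperparameters block above is what the finetune.py entrypoint ultimately consumes. The script itself is not expanded in this diff, so the following is only a minimal sketch of how such a script could read those values through Determined's cluster-info API and hand the training_args mapping to Hugging Face's TrainingArguments; the function names and overall structure are assumptions, not the repository's actual code.

# Hedged sketch: one plausible way the finetune.py entrypoint could consume
# the hyperparameters defined in deepspeed.yaml. Names and structure here are
# assumptions; the real script is not shown in this commit view.
import determined as det
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments


def load_hparams() -> dict:
    # On a Determined cluster, the experiment's `hyperparameters` section is
    # exposed through the cluster info object.
    info = det.get_cluster_info()
    assert info is not None, "expected to run inside a Determined trial"
    return info.trial.hparams


def main() -> None:
    hparams = load_hparams()
    model_name = hparams["model"]  # e.g. mistralai/Mistral-7B-Instruct-v0.2

    # The training_args mapping (output_dir, max_steps, bf16, deepspeed, ...)
    # lines up with keyword arguments accepted by TrainingArguments.
    training_args = TrainingArguments(**hparams["training_args"])

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    # ... build the dataset from hparams["dataset_subset"], then run a
    # Hugging Face Trainer with model, tokenizer, and training_args ...


if __name__ == "__main__":
    main()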
15 changes: 3 additions & 12 deletions blog/llm-finetuning/distributed.yaml
@@ -1,19 +1,11 @@
-name: mistral easy medium bf16
+name: mistral lora easy
 debug: false
-workspace: agnieszka
-project: llm-blog2
-profiling:
-  enabled: true
-  begin_on_batch: 100
-  end_after_batch: 1000
 environment:
   environment_variables:
     - NCCL_DEBUG=INFO
-    - HF_HOME=/nvmefs1/agnieszka.ciborowska/hf_cache
   image: determinedai/environments-dev:python-3.10-pytorch-2.0-deepspeed-0.10.0-smartsim
 resources:
   slots_per_trial: 2
-  resource_pool: A100
 searcher:
   name: single
   max_length:
@@ -22,13 +14,12 @@ searcher:
   smaller_is_better: false
 hyperparameters:
   model: "mistralai/Mistral-7B-Instruct-v0.2"
-  #model: "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
   dataset_subset: "easy"
   lora: true
   training_args:
     output_dir: "/tmp/llm_finetuning"
     max_steps: 5000
-    per_device_train_batch_size: 1
+    per_device_train_batch_size: 8
     per_device_eval_batch_size: 4
     bf16: true
     evaluation_strategy: "steps"
@@ -40,5 +31,5 @@ hyperparameters:
     learning_rate: 1e-5
 entrypoint: >-
   python -m determined.launch.torch_distributed
-  python finetune_with_profiling.py
+  python finetune.py
 max_restarts: 0
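
This config flips lora to true and raises the per-device train batch size to 8, while deepspeed.yaml trains the full model at batch size 2. As a hedged illustration of what that flag likely gates, the sketch below wraps the model with a PEFT LoRA adapter when the flag is set; the peft usage and the rank/alpha/dropout values are assumptions for illustration, not taken from this commit.

# Hedged sketch: how a finetuning script might branch on the `lora`
# hyperparameter. The peft calls are standard, but the rank/alpha/dropout
# values are illustrative assumptions, not values from this repository.
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM


def maybe_apply_lora(model, use_lora: bool):
    if not use_lora:
        return model  # full finetuning, as in deepspeed.yaml (lora: false)
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,             # illustrative adapter rank
        lora_alpha=16,   # illustrative scaling factor
        lora_dropout=0.05,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # only the adapter weights train
    return model


model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
model = maybe_apply_lora(model, use_lora=True)  # lora: true in distributed.yaml

Because only the adapter weights carry gradients and optimizer state, the LoRA run can plausibly afford the larger per-device batch size on the same hardware.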
193 changes: 0 additions & 193 deletions blog/llm-finetuning/finetune_with_profiling.py

This file was deleted.
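
The remaining changed files, including the scrap -> validate_tokenizer rename named in the commit message, are not expanded in this view. Purely as a hedged guess at what a tokenizer validation script for this setup might check, the sketch below does an encode/decode round trip and prints the chat-template formatting for the configured instruct model; none of it is taken from the repository's actual validate_tokenizer code.

# Hedged sketch only: the renamed validate_tokenizer script is not shown in
# this commit view, so this is a guess at a quick tokenizer sanity check for
# the model used in the configs above.
from transformers import AutoTokenizer

MODEL = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Round trip: encoding then decoding should give back (roughly) the same text.
sample = "Fine-tuning Mistral with Determined AI."
ids = tokenizer(sample, add_special_tokens=False)["input_ids"]
print("decoded:", tokenizer.decode(ids))

# Chat template: the instruct model wraps user turns in [INST] ... [/INST].
messages = [{"role": "user", "content": sample}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print("prompt:", prompt)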
