Support DeepSeek distilled model (#50)
* support deepseek distilled model
xiaoxshe authored Jan 31, 2025
1 parent d7cbc77 commit a15395f
Showing 3 changed files with 44 additions and 4 deletions.
31 changes: 31 additions & 0 deletions README.md
@@ -161,6 +161,37 @@ hyperpod start-job --job-name <job-name> [--namespace <namespace>] [--job-kind <
* `persistent-volume-claims` (list[string]) - Optional. The pre-created persistent volume claims (PVCs) that the data scientist can choose to mount to the containers. The cluster admin users should create PVCs and provide it to the data scientist users.
* `results-dir` (string) - Optional. The location to store the results, checkpoints, and logs. The cluster admin users should set this up and provide it to the data scientist users. The default value is `./results`.
* `service-account-name` - Optional. The Kubernetes service account that allows Pods to access resources based on the permissions granted to that service account. The cluster admin users should create the Kubernetes service account.
* `recipe` (string) - Optional. The recipe to use for the job. The recipe is a predefined set of parameters for the job.
* `override-parameters` (string) - Optional. The parameters to override for the job, provided as a JSON string.
Example:
```
hyperpod start-job --recipe <recipe-name>
```
Below is an example of how to use the `override-parameters` option with a DeepSeek recipe.
```
hyperpod start-job --recipe fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning --override-parameters \
'{
"cluster":"k8s",
"cluster_type":"k8s",
"container":"658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121",
"+cluster.persistent_volume_claims.0.claimName":"fsx-claim-large",
"+cluster.persistent_volume_claims.0.mountPath":"data",
"cluster.service_account_name":"",
"recipes.run.name":"deepseek",
"recipes.model.train_batch_size":"1",
"instance_type":"p4d.24xlarge",
"recipes.model.data.use_synthetic_data":"True",
"recipes.model.fp8":"False",
"recipes.exp_manager.auto_checkpoint.enabled":"False",
"recipes.exp_manager.export_full_model.save_last":"False",
"recipes.exp_manager.checkpoint_callback_params.save_last":"False",
"recipes.model.hf_model_name_or_path":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"recipes.model.hf_access_token":"<your-access-token>",
"recipes.exp_manager.exp_dir":""
}'
```
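If you prefer to assemble the override JSON programmatically rather than hand-escaping it in the shell, the sketch below builds the same parameters in Python and passes them to the CLI. This is only an illustration, not part of the CLI itself: it assumes `hyperpod` is on your `PATH`, and the recipe name and values simply mirror the example above.
```
# Minimal sketch: build --override-parameters as a Python dict and let
# json.dumps handle the quoting, then invoke the hyperpod CLI.
import json
import subprocess

recipe = "fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning"

# Values taken from the README example; adjust for your cluster.
overrides = {
    "cluster": "k8s",
    "cluster_type": "k8s",
    "instance_type": "p4d.24xlarge",
    "recipes.run.name": "deepseek",
    "recipes.model.train_batch_size": "1",
    "recipes.model.hf_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "recipes.model.hf_access_token": "<your-access-token>",
}

# json.dumps produces a single well-formed JSON string for --override-parameters.
subprocess.run(
    [
        "hyperpod", "start-job",
        "--recipe", recipe,
        "--override-parameters", json.dumps(overrides),
    ],
    check=True,
)
```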
### Getting Job Details
15 changes: 12 additions & 3 deletions src/hyperpod_cli/commands/job.py
@@ -473,6 +473,14 @@ def cancel_job(
fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning \n
fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora \n
fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora \n
"""
)
@click.option(
@@ -776,7 +784,7 @@ def start_job(
max_retry=max_retry,
deep_health_check_passed_nodes_only=deep_health_check_passed_nodes_only,
)

# TODO: Unblock this after fixing customer using EKS cluster.
console_link = utils.get_cluster_console_url()
print(json.dumps({"Console URL": console_link}, indent=1, sort_keys=False))

@@ -808,8 +816,9 @@ def patch_job(patch_type: str, job_name: str, namespace: Optional[str]):
group=KUEUE_CUSTOM_OBJECT_GROUP,
resource=WORKLOAD_CUSTOM_OBJECT_PLURAL,
)
namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template)

# TODO: Unblock this after better customer onboarding experience for Crescendo.
#namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template)
namespace = "default"

patch_type_enum = JobPatchType(patch_type)
k8s_client = KubernetesClient()
2 changes: 1 addition & 1 deletion src/hyperpod_cli/sagemaker_hyperpod_recipes
Submodule sagemaker_hyperpod_recipes updated 37 files
+9 −1 README.md
+1 −0 launcher/nemo/constants.py
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq16k_gpu_fine_tuning.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq16k_gpu_lora.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq8k_gpu_fine_tuning.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq8k_gpu_lora.sh
+27 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq16k_gpu_fine_tuning.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq16k_gpu_lora.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_fine_tuning.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_lora.sh
+1 −1 launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh
+1 −1 launcher_scripts/llama/run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh
+1 −1 launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh
+146 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning.yaml
+144 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora.yaml
+146 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning.yaml
+144 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora.yaml
+146 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning.yaml
+144 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora.yaml
+146 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning.yaml
+144 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora.yaml
+1 −1 recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x16_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x16_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.yaml
+34 −0 tests/test_recipes.py
+18 −0 tests/test_utils.py
