Support DeepSeek distilled model (#50)
* support deepseek distilled model
xiaoxshe authored Jan 31, 2025
1 parent d7cbc77 commit a15395f
Showing 3 changed files with 44 additions and 4 deletions.
31 changes: 31 additions & 0 deletions README.md
@@ -161,6 +161,37 @@ hyperpod start-job --job-name <job-name> [--namespace <namespace>] [--job-kind <
* `persistent-volume-claims` (list[string]) - Optional. The pre-created persistent volume claims (PVCs) that the data scientist can choose to mount to the containers. The cluster admin users should create PVCs and provide it to the data scientist users.
* `results-dir` (string) - Optional. The location to store the results, checkpoints, and logs. The cluster admin users should set this up and provide it to the data scientist users. The default value is `./results`.
* `service-account-name` - Optional. The Kubernetes service account that allows Pods to access resources based on the permissions granted to that service account. The cluster admin users should create the Kubernetes service account.
* `recipe` (string) - Optional. The recipe to use for the job. The recipe is a predefined set of parameters for the job.
* `override-parameters` (string) - Optional. The parameters to override for the job, provided as a JSON string.
Example:
```
hyperpod start-job --recipe <recipe-name>
```
Below is an example of how to use the `override-parameters` option with a DeepSeek recipe.
```
hyperpod start-job --recipe fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning --override-parameters \
'{
"cluster":"k8s",
"cluster_type":"k8s",
"container":"658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121",
"+cluster.persistent_volume_claims.0.claimName":"fsx-claim-large",
"+cluster.persistent_volume_claims.0.mountPath":"data",
"cluster.service_account_name":"",
"recipes.run.name":"deepseek",
"recipes.model.train_batch_size":"1",
"instance_type":"p4d.24xlarge",
"recipes.model.data.use_synthetic_data":"True",
"recipes.model.fp8":"False",
"recipes.exp_manager.auto_checkpoint.enabled":"False",
"recipes.exp_manager.export_full_model.save_last":"False",
"recipes.exp_manager.checkpoint_callback_params.save_last":"False",
"recipes.model.hf_model_name_or_path":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"recipes.model.hf_access_token":"<your-access-token>",
"recipes.exp_manager.exp_dir":""
}'
```
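If you prefer to assemble the override JSON programmatically rather than hand-escaping it in the shell, the sketch below builds the same parameters in Python and passes them to the CLI. This is only an illustration, not part of the CLI itself: it assumes `hyperpod` is on your `PATH`, and the recipe name and values simply mirror the example above.
```
# Minimal sketch: build --override-parameters as a Python dict and let
# json.dumps handle the quoting, then invoke the hyperpod CLI.
import json
import subprocess

recipe = "fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning"

# Values taken from the README example; adjust for your cluster.
overrides = {
    "cluster": "k8s",
    "cluster_type": "k8s",
    "instance_type": "p4d.24xlarge",
    "recipes.run.name": "deepseek",
    "recipes.model.train_batch_size": "1",
    "recipes.model.hf_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "recipes.model.hf_access_token": "<your-access-token>",
}

# json.dumps produces a single well-formed JSON string for --override-parameters.
subprocess.run(
    [
        "hyperpod", "start-job",
        "--recipe", recipe,
        "--override-parameters", json.dumps(overrides),
    ],
    check=True,
)
```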
### Getting Job Details
15 changes: 12 additions & 3 deletions src/hyperpod_cli/commands/job.py
@@ -473,6 +473,14 @@ def cancel_job(
fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning \n
fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora \n
fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora \n
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora \n
"""
)
@click.option(
@@ -776,7 +784,7 @@ def start_job(
max_retry=max_retry,
deep_health_check_passed_nodes_only=deep_health_check_passed_nodes_only,
)

# TODO: Unblock this after fixing customer using EKS cluster.
console_link = utils.get_cluster_console_url()
print(json.dumps({"Console URL": console_link}, indent=1, sort_keys=False))

@@ -808,8 +816,9 @@ def patch_job(patch_type: str, job_name: str, namespace: Optional[str]):
group=KUEUE_CUSTOM_OBJECT_GROUP,
resource=WORKLOAD_CUSTOM_OBJECT_PLURAL,
)
namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template)

# TODO: Unblock this after better customer onboarding experience for Crescendo.
#namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template)
namespace = "default"

patch_type_enum = JobPatchType(patch_type)
k8s_client = KubernetesClient()
2 changes: 1 addition & 1 deletion src/hyperpod_cli/sagemaker_hyperpod_recipes
Submodule sagemaker_hyperpod_recipes updated 37 files
+9 −1 README.md
+1 −0 launcher/nemo/constants.py
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq16k_gpu_fine_tuning.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq16k_gpu_lora.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq8k_gpu_fine_tuning.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_70b_seq8k_gpu_lora.sh
+27 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq16k_gpu_fine_tuning.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq16k_gpu_lora.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_fine_tuning.sh
+28 −0 launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_lora.sh
+1 −1 launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh
+1 −1 launcher_scripts/llama/run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh
+1 −1 launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh
+146 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning.yaml
+144 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora.yaml
+146 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning.yaml
+144 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora.yaml
+146 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning.yaml
+144 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora.yaml
+146 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning.yaml
+144 −0 recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora.yaml
+1 −1 recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x16_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x16_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.yaml
+1 −1 recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.yaml
+34 −0 tests/test_recipes.py
+18 −0 tests/test_utils.py
