diff --git a/README.md b/README.md index 98d76d3..d4e6379 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,37 @@ hyperpod start-job --job-name [--namespace ] [--job-kind < * `persistent-volume-claims` (list[string]) - Optional. The pre-created persistent volume claims (PVCs) that the data scientist can choose to mount to the containers. The cluster admin users should create PVCs and provide it to the data scientist users. * `results-dir` (string) - Optional. The location to store the results, checkpoints, and logs. The cluster admin users should set this up and provide it to the data scientist users. The default value is `./results`. * `service-account-name` - Optional. The Kubernetes service account that allows Pods to access resources based on the permissions granted to that service account. The cluster admin users should create the Kubernetes service account. +* `recipe` (string) - Optional. The recipe to use for the job. The recipe is a predefined set of parameters for the job. +* `override-parameters` (string) - Optional. The parameters to override for the job. The parameters are in JSON format. +Example: +``` +hyperpod start-job --recipe <recipe-name> +``` + +Below is an example of how to use the `override-parameters` option and the deepseek recipe. 
+ +``` +hyperpod start-job --recipe fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning --override-parameters \ +'{ + "cluster":"k8s", + "cluster_type":"k8s", + "container":"658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121", + "+cluster.persistent_volume_claims.0.claimName":"fsx-claim-large", + "+cluster.persistent_volume_claims.0.mountPath":"data", + "cluster.service_account_name":"", + "recipes.run.name":"deepseek", + "recipes.model.train_batch_size":"1", + "instance_type":"p4d.24xlarge", + "recipes.model.data.use_synthetic_data":"True", + "recipes.model.fp8":"False", + "recipes.exp_manager.auto_checkpoint.enabled":"False", + "recipes.exp_manager.export_full_model.save_last":"False", + "recipes.exp_manager.checkpoint_callback_params.save_last":"False", + "recipes.model.hf_model_name_or_path":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "recipes.model.hf_access_token":"", + "recipes.exp_manager.exp_dir":"" +}' +``` ### Getting Job Details diff --git a/src/hyperpod_cli/commands/job.py b/src/hyperpod_cli/commands/job.py index 1d867d2..8732f2c 100644 --- a/src/hyperpod_cli/commands/job.py +++ b/src/hyperpod_cli/commands/job.py @@ -473,6 +473,14 @@ def cancel_job( fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning \n fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora \n fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning \n +fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning \n +fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning \n +fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora \n +fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora \n +fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning \n +fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning \n +fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora \n 
+fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora \n """ ) @click.option( @@ -776,7 +784,7 @@ def start_job( max_retry=max_retry, deep_health_check_passed_nodes_only=deep_health_check_passed_nodes_only, ) - + # TODO: Unblock this after fixing customer using EKS cluster. console_link = utils.get_cluster_console_url() print(json.dumps({"Console URL": console_link}, indent=1, sort_keys=False)) @@ -808,8 +816,9 @@ def patch_job(patch_type: str, job_name: str, namespace: Optional[str]): group=KUEUE_CUSTOM_OBJECT_GROUP, resource=WORKLOAD_CUSTOM_OBJECT_PLURAL, ) - namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template) - + # TODO: Unblock this after better customer onboarding experience for Crescendo. + #namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template) + namespace = "default" patch_type_enum = JobPatchType(patch_type) k8s_client = KubernetesClient() diff --git a/src/hyperpod_cli/sagemaker_hyperpod_recipes b/src/hyperpod_cli/sagemaker_hyperpod_recipes index 66e49e0..6bd77d3 160000 --- a/src/hyperpod_cli/sagemaker_hyperpod_recipes +++ b/src/hyperpod_cli/sagemaker_hyperpod_recipes @@ -1 +1 @@ -Subproject commit 66e49e0a86bc3602ae8db5ea8f01e249328475b6 +Subproject commit 6bd77d3b0917bbed1a311e8f7fafa2cdce45b10e