Skip to content

Commit

Permalink
[Azure] Add num_gpus for ray status (#3313)
Browse files: browse the repository at this point in the history
* Add num_gpus for ray status

* fix type
  • Loading branch information
Michaelvll authored Mar 14, 2024
1 parent ed5bb75 commit 498d02c
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 2 deletions.
3 changes: 3 additions & 0 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,10 @@ def make_deploy_resources_variables(
r = resources
# r.accelerators is cleared but .instance_type encodes the info.
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
+ acc_count = None
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
+ acc_count = str(sum(acc_dict.values()))
else:
custom_resources = None
# pylint: disable=import-outside-toplevel
Expand Down Expand Up @@ -319,6 +321,7 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
return {
'instance_type': r.instance_type,
'custom_resources': custom_resources,
+ 'num_gpus': acc_count,
'use_spot': r.use_spot,
'region': region_name,
# Azure does not support specific zones.
Expand Down
4 changes: 2 additions & 2 deletions sky/templates/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,14 @@ setup_commands:
# current num items (num SSH connections): 2
head_start_ray_commands:
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
- - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
+ - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{{dump_port_command}};
{{ray_head_wait_initialized_command}}

{%- if num_nodes > 1 %}
worker_start_ray_commands:
- - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
+ - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{%- else %}
worker_start_ray_commands: []
Expand Down

0 comments on commit 498d02c

Please sign in to comment.