From ab9daf038828602f9f49545a06caac1611bc9cb0 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 21 Aug 2023 21:27:18 -0700 Subject: [PATCH] [AWS] unset AWS env vars to avoid autoscaler using the incorrect credentials (#2442) * unset AWS env vars to avoid autoscaler using the incorrect credentials * add comment * Update sky/templates/aws-ray.yml.j2 Co-authored-by: Zongheng Yang --------- Co-authored-by: Zongheng Yang --- sky/templates/aws-ray.yml.j2 | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 87ff6a7c48d..7f04c938215 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -225,16 +225,20 @@ setup_commands: # Increment the following for catching performance bugs easier: # current num items (num SSH connections): 1 head_start_ray_commands: + # Unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY to avoid using credentials from environment + # variables set by user. SkyPilot's ray cluster should use the `~/.aws/` credentials, as that is + # the one used to create the cluster, and the autoscaler module started by the `ray start` command + # should use the same credentials. Otherwise, `ray status` will fail to fetch the available nodes. Reference: https://github.com/skypilot-org/skypilot/issues/2441 # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - ray stop; unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {%- if num_nodes > 1 %} worker_start_ray_commands: - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - ray stop; unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: []