From f3507f0bdb9d871e121d65c5eb7c3c071fd0ed24 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 29 Oct 2022 02:43:00 -0700 Subject: [PATCH 1/2] Fix skylet checking --- sky/templates/aws-ray.yml.j2 | 2 +- sky/templates/azure-ray.yml.j2 | 2 +- sky/templates/gcp-ray.yml.j2 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 5732d61dc45..32ed0323ecc 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -128,7 +128,7 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - ((ps aux | grep "-m sky.skylet.skylet" | grep -q python3) || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); + - ((ps aux | grep -v nohup | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); ray stop; ray start --disable-usage-stats --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 88f442a4d69..c222e29b7ce 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -124,7 +124,7 @@ setup_commands: head_start_ray_commands: # Start skylet daemon. (Should not place it in the head_setup_commands, otherwise it will run before skypilot is installed.) # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. - - ((ps aux | grep "-m sky.skylet.skylet" | grep -q python3) || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); + - ((ps aux | grep -v nohup | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); ray stop; ray start --disable-usage-stats --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 892b4986b1c..4452bde2a4e 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -162,7 +162,7 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - ((ps aux | grep "-m sky.skylet.skylet" | grep -q python3) || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); + - ((ps aux | grep -v nohup | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); export SKY_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKY_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l); ray stop; ray start --disable-usage-stats --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKY_NUM_GPUS || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; From 62ae7bbe360ecc581a210175e27dc46f82ffd265 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 29 Oct 2022 02:46:13 -0700 Subject: [PATCH 2/2] exclude grep --- sky/templates/aws-ray.yml.j2 | 2 +- sky/templates/azure-ray.yml.j2 | 2 +- sky/templates/gcp-ray.yml.j2 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 32ed0323ecc..4ed178b4497 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -128,7 +128,7 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - ((ps aux | grep -v nohup | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); + - ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); ray stop; ray start --disable-usage-stats --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index c222e29b7ce..66c97cebf39 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -124,7 +124,7 @@ setup_commands: head_start_ray_commands: # Start skylet daemon. (Should not place it in the head_setup_commands, otherwise it will run before skypilot is installed.) # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. - - ((ps aux | grep -v nohup | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); + - ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); ray stop; ray start --disable-usage-stats --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 4452bde2a4e..a0964d8c274 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -162,7 +162,7 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - ((ps aux | grep -v nohup | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); + - ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); export SKY_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKY_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l); ray stop; ray start --disable-usage-stats --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKY_NUM_GPUS || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;