Skip to content

Commit

Permalink
Merge branch 'master' of github.com:skypilot-org/skypilot into fix-co…
Browse files Browse the repository at this point in the history
…ntroller-logging
  • Loading branch information
Michaelvll committed May 11, 2023
2 parents b65f9a1 + bbdd5b1 commit bc67043
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 30 deletions.
30 changes: 13 additions & 17 deletions sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import socket
import subprocess
import sys
import textwrap
import time
from typing import Any, Dict, Tuple
import uuid
Expand All @@ -15,6 +14,7 @@
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.hazmat.backends import default_backend
import yaml

from sky import clouds
from sky import sky_logging
Expand Down Expand Up @@ -88,26 +88,22 @@ def get_or_generate_keys() -> Tuple[str, str]:
return private_key_path, public_key_path


def _replace_cloud_init_ssh_info_in_config(config: Dict[str, Any],
                                           public_key: str) -> Dict[str, Any]:
    """Fill the SSH placeholders of a cluster config with real values.

    The config is serialized to a YAML string, every occurrence of the
    placeholder ``skypilot:ssh_user`` is replaced with
    ``config['auth']['ssh_user']`` and every occurrence of
    ``skypilot:ssh_public_key_content`` is replaced with the public key.
    The substituted string is then parsed back into a dict.

    Args:
        config: Cluster config; ``config['auth']['ssh_user']`` must exist.
        public_key: Public key text, e.g. the raw content of a ``.pub``
            file. Surrounding whitespace is ignored.

    Returns:
        The config parsed from the substituted YAML string.

    Raises:
        KeyError: If ``config['auth']['ssh_user']`` is missing.
    """
    # Key files normally end with a newline. Splicing that newline verbatim
    # into already-serialized YAML changes the text of the scalar that holds
    # the placeholder once it is re-parsed, so only the key itself is
    # substituted.
    public_key = public_key.strip()

    config_str = common_utils.dump_yaml_str(config)
    config_str = config_str.replace('skypilot:ssh_user',
                                    config['auth']['ssh_user'])
    config_str = config_str.replace('skypilot:ssh_public_key_content',
                                    public_key)
    return yaml.safe_load(config_str)


def setup_aws_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
_, public_key_path = get_or_generate_keys()
with open(public_key_path, 'r') as f:
public_key = f.read()
# Use cloud init in UserData to set up the authorized_keys to get
# around the number of keys limit and permission issues with
# ec2.describe_key_pairs.
# Note that sudo and shell need to be specified to ensure setup works.
# Reference: https://cloudinit.readthedocs.io/en/latest/reference/modules.html#users-and-groups # pylint: disable=line-too-long
for node_type in config['available_node_types']:
config['available_node_types'][node_type]['node_config']['UserData'] = (
textwrap.dedent(f"""\
#cloud-config
users:
- name: {config['auth']['ssh_user']}
shell: /bin/bash
sudo: ALL=(ALL) NOPASSWD:ALL
ssh-authorized-keys:
- {public_key}
"""))
config = _replace_cloud_init_ssh_info_in_config(config, public_key)
return config


Expand Down
5 changes: 5 additions & 0 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,13 @@
# it's possible to failover to 1b, which leaves a leaked instance in 1a. Here,
# we use the new yaml's zone field, which is guaranteed to be the existing zone
# '1a'.
# - UserData: The UserData field of the old yaml may be outdated, and we want to
# use the new yaml's UserData field, which contains the authorized key setup as
# well as the disabling of the auto-update with apt-get.
_RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
('provider', 'availability_zone'),
('available_node_types', 'ray.head.default', 'node_config', 'UserData'),
('available_node_types', 'ray.worker.default', 'node_config', 'UserData'),
]


Expand Down
2 changes: 1 addition & 1 deletion sky/skylet/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def _stop_cluster(self, autostop_config):
# by the env vars). See #1880 for details.
env = dict(os.environ, RAY_USAGE_STATS_ENABLED='0')
env.pop('AWS_ACCESS_KEY_ID', None)
env.pop('AWS_SECRETE_ACCESS_KEY', None)
env.pop('AWS_SECRET_ACCESS_KEY', None)

# We do "initial ray up + ray down --workers-only" only for
# multinode clusters as they are not needed for single-node.
Expand Down
44 changes: 33 additions & 11 deletions sky/templates/aws-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,26 @@ available_node_types:
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
{% endif %}
# Use cloud init in UserData to set up the authorized_keys to get
# around the number of keys limit and permission issues with
# ec2.describe_key_pairs.
# Note that sudo and shell need to be specified to ensure setup works.
# Reference: https://cloudinit.readthedocs.io/en/latest/reference/modules.html#users-and-groups
# The bootcmd disables automatic APT updates, to avoid contention on the
# APT lock when the user calls `apt install` on the node.
# Reference: https://unix.stackexchange.com/a/471192
UserData: |
#cloud-config
users:
- name: skypilot:ssh_user
shell: /bin/bash
sudo: ALL=(ALL) NOPASSWD:ALL
ssh_authorized_keys:
- skypilot:ssh_public_key_content
bootcmd:
- echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable
- apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades
- echo "Removed APT" | systemd-cat
TagSpecifications:
- ResourceType: instance
Tags:
Expand Down Expand Up @@ -88,6 +108,18 @@ available_node_types:
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
{% endif %}
UserData: |
#cloud-config
users:
- name: skypilot:ssh_user
shell: /bin/bash
sudo: ALL=(ALL) NOPASSWD:ALL
ssh_authorized_keys:
- skypilot:ssh_public_key_content
bootcmd:
- echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable
- apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades
- echo "Removed APT" | systemd-cat
TagSpecifications:
- ResourceType: instance
Tags:
Expand Down Expand Up @@ -127,17 +159,7 @@ setup_commands:
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
p=$(mylsof "/var/lib/dpkg/lock-frontend"); echo "$p";
sudo kill -9 `echo "$p" | tail -n 1` || true;
sudo rm /var/lib/dpkg/lock-frontend;
sudo pkill -9 dpkg;
sudo pkill -9 apt-get;
sudo dpkg --configure --force-overwrite -a;
mkdir -p ~/.ssh; touch ~/.ssh/config;
- mkdir -p ~/.ssh; touch ~/.ssh/config;
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
Expand Down
2 changes: 1 addition & 1 deletion tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -1375,7 +1375,7 @@ def test_spot_failed_setup(generic_cloud: str):
'spot-failed-setup',
[
f'sky spot launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml',
'sleep 300',
'sleep 330',
# Make sure the job failed quickly.
f'{_SPOT_QUEUE_WAIT} | grep {name} | head -n1 | grep "FAILED_SETUP"',
],
Expand Down

0 comments on commit bc67043

Please sign in to comment.