Skip to content

Commit

Permalink
Runner Fix nvidia setup and restart (#607)
Browse files Browse the repository at this point in the history
* Runner Fix nvidia setup and restart

* Restyled by shfmt

* fix tests

* sh nitpick

* revert go.mod

* fix tests after nitpick

* dialWithDeadline

* Apply suggestions from code review

* Apply suggestions from code review

* Update golden tests

Co-authored-by: Restyled.io <[email protected]>
Co-authored-by: Helio Machado <[email protected]>
  • Loading branch information
3 people authored Jun 21, 2022
1 parent 27d0daa commit 5194dc5
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 66 deletions.
22 changes: 12 additions & 10 deletions environment/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ if [ ! -f "$FILE" ]; then
sudo usermod -aG docker ubuntu
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock

# Download the Amazon ECR credential helper and make it executable from a
# detached systemd-run unit (--no-block), so the rest of the setup does not
# wait for the download.
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
# The names expanded here must match the assignments above exactly: an unset
# variable expands to an empty string, which would hand bash a command that
# starts with "&&" and never runs the download.
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_helper && $chmod_ecr_help"

curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
sudo apt update && sudo apt-get install -y terraform
Expand All @@ -25,16 +29,14 @@ if [ ! -f "$FILE" ]; then
sudo apt update && sudo apt-get install -y nodejs

sudo apt install -y ubuntu-drivers-common
sudo ubuntu-drivers autoinstall

get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"

curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
if ubuntu-drivers devices | grep -q NVIDIA; then
sudo ubuntu-drivers install

curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
fi

echo OK | sudo tee "$FILE"
fi
35 changes: 20 additions & 15 deletions iterative/resource_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func resourceRunner() *schema.Resource {
DeleteContext: resourceRunnerDelete,
ReadContext: resourceMachineRead,
Timeouts: &schema.ResourceTimeout{
Create: schema.DefaultTimeout(10 * time.Minute),
Create: schema.DefaultTimeout(20 * time.Minute),
Update: schema.DefaultTimeout(10 * time.Minute),
Delete: schema.DefaultTimeout(10 * time.Minute),
},
Expand Down Expand Up @@ -247,27 +247,30 @@ func resourceRunnerCreate(ctx context.Context, d *schema.ResourceData, m interfa

var logError error
var logEvents string
err = resource.Retry(d.Timeout(schema.TimeoutCreate), func() *resource.RetryError {
switch cloud := d.Get("cloud").(string); cloud {
cloud := d.Get("cloud").(string)
ip := d.Get("instance_ip").(string)
err = resource.Retry(d.Timeout(schema.TimeoutCreate)-time.Minute, func() *resource.RetryError {

switch cloud {
case "kubernetes":
logEvents, logError = resourceMachineLogs(ctx, d, m)
default:
logEvents, logError = utils.RunCommand("journalctl --unit cml --no-pager",
2*time.Second,
net.JoinHostPort(d.Get("instance_ip").(string), "22"),
net.JoinHostPort(ip, "22"),
"ubuntu",
d.Get("ssh_private").(string))
}

log.Printf("[DEBUG] Collected log events: %#v", logEvents)
log.Printf("[DEBUG] Connection errors: %#v", logError)

if logError != nil {
return resource.RetryableError(fmt.Errorf("Waiting for the machine to accept connections... %s", logError))
} else if utils.HasStatus(logEvents, "terminated") {
return resource.NonRetryableError(fmt.Errorf("Failed to launch the runner!"))
} else if utils.HasStatus(logEvents, "ready") {
return nil
log.Printf("[DEBUG] Connection errors: %#v", logError)
} else {
log.Printf("[DEBUG] Collected log events: %#v", logEvents)
if utils.HasStatus(logEvents, "terminated") {
return resource.NonRetryableError(fmt.Errorf("Failed to launch the runner!"))
} else if utils.HasStatus(logEvents, "ready") {
return nil
}
}

return resource.RetryableError(fmt.Errorf("Waiting for the runner to be ready..."))
Expand Down Expand Up @@ -374,9 +377,11 @@ EOF'
{{- if .cloud}}
sudo systemctl daemon-reload
sudo systemctl enable cml.service
{{- if .instance_gpu}}
nvidia-smi &>/dev/null || reboot
{{- end}}
if ubuntu-drivers devices | grep -q NVIDIA; then
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
fi
sudo systemctl start cml.service
{{- end}}
Expand Down
26 changes: 16 additions & 10 deletions iterative/testdata/script_template_cloud_aws.golden
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ if [ ! -f "$FILE" ]; then
sudo usermod -aG docker ubuntu
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock

get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"

curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
sudo apt update && sudo apt-get install -y terraform
Expand All @@ -26,16 +30,14 @@ if [ ! -f "$FILE" ]; then
sudo apt update && sudo apt-get install -y nodejs

sudo apt install -y ubuntu-drivers-common
sudo ubuntu-drivers autoinstall
if ubuntu-drivers devices | grep -q NVIDIA; then
sudo ubuntu-drivers install

get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"

curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
fi

echo OK | sudo tee "$FILE"
fi
Expand Down Expand Up @@ -73,5 +75,9 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
EOF'
sudo systemctl daemon-reload
sudo systemctl enable cml.service
nvidia-smi &>/dev/null || reboot

if ubuntu-drivers devices | grep -q NVIDIA; then
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
fi

sudo systemctl start cml.service
26 changes: 16 additions & 10 deletions iterative/testdata/script_template_cloud_azure.golden
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ if [ ! -f "$FILE" ]; then
sudo usermod -aG docker ubuntu
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock

get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"

curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
sudo apt update && sudo apt-get install -y terraform
Expand All @@ -26,16 +30,14 @@ if [ ! -f "$FILE" ]; then
sudo apt update && sudo apt-get install -y nodejs

sudo apt install -y ubuntu-drivers-common
sudo ubuntu-drivers autoinstall
if ubuntu-drivers devices | grep -q NVIDIA; then
sudo ubuntu-drivers install

get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"

curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
fi

echo OK | sudo tee "$FILE"
fi
Expand Down Expand Up @@ -74,5 +76,9 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
EOF'
sudo systemctl daemon-reload
sudo systemctl enable cml.service
nvidia-smi &>/dev/null || reboot

if ubuntu-drivers devices | grep -q NVIDIA; then
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
fi

sudo systemctl start cml.service
26 changes: 16 additions & 10 deletions iterative/testdata/script_template_cloud_gcp.golden
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ if [ ! -f "$FILE" ]; then
sudo usermod -aG docker ubuntu
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock

get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"

curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
sudo apt update && sudo apt-get install -y terraform
Expand All @@ -26,16 +30,14 @@ if [ ! -f "$FILE" ]; then
sudo apt update && sudo apt-get install -y nodejs

sudo apt install -y ubuntu-drivers-common
sudo ubuntu-drivers autoinstall
if ubuntu-drivers devices | grep -q NVIDIA; then
sudo ubuntu-drivers install

get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"

curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
fi

echo OK | sudo tee "$FILE"
fi
Expand Down Expand Up @@ -72,5 +74,9 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
EOF'
sudo systemctl daemon-reload
sudo systemctl enable cml.service
nvidia-smi &>/dev/null || reboot

if ubuntu-drivers devices | grep -q NVIDIA; then
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
fi

sudo systemctl start cml.service
26 changes: 16 additions & 10 deletions iterative/testdata/script_template_cloud_invalid.golden
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ if [ ! -f "$FILE" ]; then
sudo usermod -aG docker ubuntu
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock

get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"

curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
sudo apt update && sudo apt-get install -y terraform
Expand All @@ -26,16 +30,14 @@ if [ ! -f "$FILE" ]; then
sudo apt update && sudo apt-get install -y nodejs

sudo apt install -y ubuntu-drivers-common
sudo ubuntu-drivers autoinstall
if ubuntu-drivers devices | grep -q NVIDIA; then
sudo ubuntu-drivers install

get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"

curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-docker2
sudo systemctl restart docker
fi

echo OK | sudo tee "$FILE"
fi
Expand Down Expand Up @@ -70,5 +72,9 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
EOF'
sudo systemctl daemon-reload
sudo systemctl enable cml.service
nvidia-smi &>/dev/null || reboot

if ubuntu-drivers devices | grep -q NVIDIA; then
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
fi

sudo systemctl start cml.service
21 changes: 20 additions & 1 deletion iterative/utils/ssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"crypto/x509"
"encoding/pem"
"fmt"
"net"
"strings"
"time"

Expand Down Expand Up @@ -64,7 +65,7 @@ func RunCommand(command string, timeout time.Duration, hostAddress string, userN
Timeout: timeout,
}

client, err := ssh.Dial("tcp", hostAddress, configuration)
client, err := dialWithDeadline("tcp", hostAddress, configuration)
if err != nil {
return "", err
}
Expand All @@ -83,3 +84,21 @@ func RunCommand(command string, timeout time.Duration, hostAddress string, userN

return string(output), nil
}

// dialWithDeadline opens a connection to addr on the given network and
// performs the SSH client handshake over it, applying config.Timeout to both
// steps: the dial goes through net.DialTimeout, and a read deadline of
// config.Timeout is armed on the connection while ssh.NewClientConn runs.
// The read deadline is cleared once the handshake succeeds so that later
// reads on the returned client are not cut short.
//
// When config.Timeout is zero or negative, no read deadline is set.
//
// It returns the ready-to-use client, or the first error encountered while
// dialing, arming or clearing the deadline, or handshaking.
func dialWithDeadline(network string, addr string, config *ssh.ClientConfig) (*ssh.Client, error) {
	conn, err := net.DialTimeout(network, addr, config.Timeout)
	if err != nil {
		return nil, err
	}

	if config.Timeout > 0 {
		// Without this deadline the handshake could block on reads for as long
		// as the peer stays silent, so a failure to arm it is fatal.
		if err := conn.SetReadDeadline(time.Now().Add(config.Timeout)); err != nil {
			_ = conn.Close() // best effort; the deadline error is the one worth reporting
			return nil, err
		}
	}

	// NOTE(review): no explicit conn.Close() on this error path, matching the
	// original; presumably ssh.NewClientConn closes conn itself when the
	// handshake fails — confirm against the x/crypto/ssh version in go.mod.
	c, chans, reqs, err := ssh.NewClientConn(conn, addr, config)
	if err != nil {
		return nil, err
	}

	if config.Timeout > 0 {
		// Clear the handshake deadline; leaving it in place would make reads
		// on the established session fail once it expires.
		if err := conn.SetReadDeadline(time.Time{}); err != nil {
			_ = c.Close() // best effort; the deadline error is the one worth reporting
			return nil, err
		}
	}

	return ssh.NewClient(c, chans, reqs), nil
}

0 comments on commit 5194dc5

Please sign in to comment.