-
Notifications
You must be signed in to change notification settings - Fork 2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Qemu driver: graceful shutdown feature #3411
Changes from 8 commits
00e3cc8
3f6fdfc
16a3614
22f390d
200a12c
1ff9703
2924bad
60030d8
15d7565
1856585
66f9840
f734d84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,10 @@ package driver | |
import ( | ||
"context" | ||
"encoding/json" | ||
"errors" | ||
"fmt" | ||
"log" | ||
"net" | ||
"os" | ||
"os/exec" | ||
"path/filepath" | ||
|
@@ -13,7 +15,8 @@ import ( | |
"strings" | ||
"time" | ||
|
||
"github.com/hashicorp/go-plugin" | ||
"github.com/coreos/go-semver/semver" | ||
plugin "github.com/hashicorp/go-plugin" | ||
"github.com/hashicorp/nomad/client/config" | ||
"github.com/hashicorp/nomad/client/driver/executor" | ||
dstructs "github.com/hashicorp/nomad/client/driver/structs" | ||
|
@@ -29,9 +32,15 @@ var ( | |
) | ||
|
||
const ( | ||
// The key populated in Node Attributes to indicate presence of the Qemu | ||
// driver | ||
qemuDriverAttr = "driver.qemu" | ||
// The key populated in Node Attributes to indicate presence of the Qemu driver | ||
qemuDriverAttr = "driver.qemu" | ||
qemuDriverVersionAttr = "driver.qemu.version" | ||
qemuDriverLongMonitorPathAttr = "driver.qemu.longsocketpaths" | ||
// Represents an ACPI shutdown request to the VM (emulates pressing a physical power button) | ||
// Reference: https://en.wikibooks.org/wiki/QEMU/Monitor | ||
qemuGracefulShutdownMsg = "system_powerdown\n" | ||
legacyMaxMonitorPathLen = 108 | ||
qemuMonitorSocketName = "qemu-monitor.sock" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about windows? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question! Until today, I didn't realize QEMU was available on Windows hosts. I would need to spin up an EC2 instance and experiment a bit with QEMU monitor interaction on that platform. It appears an existing QEMU driver option can cause trouble on Windows hosts ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I think that is acceptable. Would you mind adding error cases for both graceful and kvm on windows. |
||
) | ||
|
||
// QemuDriver is a driver for running images via Qemu | ||
|
@@ -45,17 +54,19 @@ type QemuDriver struct { | |
} | ||
|
||
type QemuDriverConfig struct { | ||
ImagePath string `mapstructure:"image_path"` | ||
Accelerator string `mapstructure:"accelerator"` | ||
PortMap []map[string]int `mapstructure:"port_map"` // A map of host port labels and to guest ports. | ||
Args []string `mapstructure:"args"` // extra arguments to qemu executable | ||
ImagePath string `mapstructure:"image_path"` | ||
Accelerator string `mapstructure:"accelerator"` | ||
GracefulShutdown bool `mapstructure:"graceful_shutdown"` | ||
PortMap []map[string]int `mapstructure:"port_map"` // A map of host port labels and to guest ports. | ||
Args []string `mapstructure:"args"` // extra arguments to qemu executable | ||
} | ||
|
||
// qemuHandle is returned from Start/Open as a handle to the PID | ||
type qemuHandle struct { | ||
pluginClient *plugin.Client | ||
userPid int | ||
executor executor.Executor | ||
monitorPath string | ||
killTimeout time.Duration | ||
maxKillTimeout time.Duration | ||
logger *log.Logger | ||
|
@@ -64,6 +75,13 @@ type qemuHandle struct { | |
doneCh chan struct{} | ||
} | ||
|
||
func getMonitorPath(dir string, longPathSupport string) (string, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment on this method. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! |
||
if len(dir) > legacyMaxMonitorPathLen && longPathSupport != "1" { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of taking the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! |
||
return "", fmt.Errorf("monitor path is too long") | ||
} | ||
return fmt.Sprintf("%s/%s", dir, qemuMonitorSocketName), nil | ||
} | ||
|
||
// NewQemuDriver is used to create a new exec driver | ||
func NewQemuDriver(ctx *DriverContext) Driver { | ||
return &QemuDriver{DriverContext: *ctx} | ||
|
@@ -81,6 +99,10 @@ func (d *QemuDriver) Validate(config map[string]interface{}) error { | |
"accelerator": { | ||
Type: fields.TypeString, | ||
}, | ||
"graceful_shutdown": { | ||
Type: fields.TypeBool, | ||
Required: false, | ||
}, | ||
"port_map": { | ||
Type: fields.TypeArray, | ||
}, | ||
|
@@ -127,9 +149,26 @@ func (d *QemuDriver) Fingerprint(cfg *config.Config, node *structs.Node) (bool, | |
delete(node.Attributes, qemuDriverAttr) | ||
return false, fmt.Errorf("Unable to parse Qemu version string: %#v", matches) | ||
} | ||
currentQemuVersion := matches[1] | ||
|
||
node.Attributes[qemuDriverAttr] = "1" | ||
node.Attributes["driver.qemu.version"] = matches[1] | ||
node.Attributes[qemuDriverVersionAttr] = currentQemuVersion | ||
|
||
// Prior to qemu 2.10.1, monitor socket paths are truncated to 108 bytes. | ||
// We should consider this if driver.qemu.version is < 2.10.1 and the | ||
// generated monitor path is too long. | ||
// | ||
// Relevant fix is here: | ||
// https://github.com/qemu/qemu/commit/ad9579aaa16d5b385922d49edac2c96c79bcfb6 | ||
currentQemuSemver := semver.New(currentQemuVersion) | ||
fixedSocketPathLenVer := semver.New("2.10.1") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you make this a variable with a comment. Can be a singleton. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! |
||
if currentQemuSemver.LessThan(*fixedSocketPathLenVer) { | ||
node.Attributes[qemuDriverLongMonitorPathAttr] = "0" | ||
d.logger.Printf("[DEBUG] driver.qemu: long socket paths are not available in this version of QEMU (%s)", currentQemuVersion) | ||
} else { | ||
d.logger.Printf("[DEBUG] driver.qemu: long socket paths available in this version of QEMU (%s)", currentQemuVersion) | ||
node.Attributes[qemuDriverLongMonitorPathAttr] = "1" | ||
} | ||
return true, nil | ||
} | ||
|
||
|
@@ -190,6 +229,19 @@ func (d *QemuDriver) Start(ctx *ExecContext, task *structs.Task) (*StartResponse | |
"-nographic", | ||
} | ||
|
||
var monitorPath string | ||
if d.driverConfig.GracefulShutdown { | ||
// This socket will be used to manage the virtual machine (for example, | ||
// to perform graceful shutdowns) | ||
monitorPath, err = getMonitorPath(ctx.TaskDir.Dir, ctx.TaskEnv.NodeAttrs[qemuDriverLongMonitorPathAttr]) | ||
if err == nil { | ||
d.logger.Printf("[DEBUG] driver.qemu: got monitor path OK: %s", monitorPath) | ||
args = append(args, "-monitor", fmt.Sprintf("unix:%s,server,nowait", monitorPath)) | ||
} else { | ||
d.logger.Printf("[WARN] driver.qemu: %s", err) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should fail the task. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! This will now return (and log) an error. |
||
} | ||
} | ||
|
||
// Add pass through arguments to qemu executable. A user can specify | ||
// these arguments in driver task configuration. These arguments are | ||
// passed directly to the qemu driver as command line options. | ||
|
@@ -239,7 +291,7 @@ func (d *QemuDriver) Start(ctx *ExecContext, task *structs.Task) (*StartResponse | |
) | ||
} | ||
|
||
d.logger.Printf("[DEBUG] Starting QemuVM command: %q", strings.Join(args, " ")) | ||
d.logger.Printf("[DEBUG] driver.qemu: starting QemuVM command: %q", strings.Join(args, " ")) | ||
pluginLogFile := filepath.Join(ctx.TaskDir.Dir, "executor.out") | ||
executorConfig := &dstructs.ExecutorConfig{ | ||
LogFile: pluginLogFile, | ||
|
@@ -272,7 +324,7 @@ func (d *QemuDriver) Start(ctx *ExecContext, task *structs.Task) (*StartResponse | |
pluginClient.Kill() | ||
return nil, err | ||
} | ||
d.logger.Printf("[INFO] Started new QemuVM: %s", vmID) | ||
d.logger.Printf("[INFO] driver.qemu: started new QemuVM: %s", vmID) | ||
|
||
// Create and Return Handle | ||
maxKill := d.DriverContext.config.MaxKillTimeout | ||
|
@@ -282,6 +334,7 @@ func (d *QemuDriver) Start(ctx *ExecContext, task *structs.Task) (*StartResponse | |
userPid: ps.Pid, | ||
killTimeout: GetKillTimeout(task.KillTimeout, maxKill), | ||
maxKillTimeout: maxKill, | ||
monitorPath: monitorPath, | ||
version: d.config.Version.VersionNumber(), | ||
logger: d.logger, | ||
doneCh: make(chan struct{}), | ||
|
@@ -308,7 +361,7 @@ type qemuId struct { | |
func (d *QemuDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, error) { | ||
id := &qemuId{} | ||
if err := json.Unmarshal([]byte(handleID), id); err != nil { | ||
return nil, fmt.Errorf("Failed to parse handle '%s': %v", handleID, err) | ||
return nil, fmt.Errorf("Failed to parse handle %q: %v", handleID, err) | ||
} | ||
|
||
pluginConfig := &plugin.ClientConfig{ | ||
|
@@ -317,9 +370,9 @@ func (d *QemuDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, erro | |
|
||
exec, pluginClient, err := createExecutorWithConfig(pluginConfig, d.config.LogOutput) | ||
if err != nil { | ||
d.logger.Println("[ERR] driver.qemu: error connecting to plugin so destroying plugin pid and user pid") | ||
d.logger.Printf("[ERR] driver.qemu: error connecting to plugin so destroying plugin pid %d and user pid %d", id.PluginConfig.Pid, id.UserPid) | ||
if e := destroyPlugin(id.PluginConfig.Pid, id.UserPid); e != nil { | ||
d.logger.Printf("[ERR] driver.qemu: error destroying plugin and userpid: %v", e) | ||
d.logger.Printf("[ERR] driver.qemu: error destroying plugin pid %d and userpid %d: %v", id.PluginConfig.Pid, id.UserPid, e) | ||
} | ||
return nil, fmt.Errorf("error connecting to plugin: %v", err) | ||
} | ||
|
@@ -381,27 +434,36 @@ func (h *qemuHandle) Signal(s os.Signal) error { | |
return fmt.Errorf("Qemu driver can't send signals") | ||
} | ||
|
||
// TODO: allow a 'shutdown_command' that can be executed over a ssh connection | ||
// to the VM | ||
func (h *qemuHandle) Kill() error { | ||
if err := h.executor.ShutDown(); err != nil { | ||
if h.pluginClient.Exited() { | ||
return nil | ||
// First, try sending a graceful shutdown command via the qemu monitor | ||
err := sendQemuShutdown(h.logger, h.monitorPath, h.userPid) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wrap this in a if check seeing if graceful shutdown is enabled and log the error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! |
||
|
||
// If we did not send a graceful shutdown via the monitor socket, we'll | ||
// issue an interrupt to the qemu process as a last resort | ||
if err != nil { | ||
if err := h.executor.ShutDown(); err != nil { | ||
if h.pluginClient.Exited() { | ||
return nil | ||
} | ||
return fmt.Errorf("executor Shutdown failed: %v", err) | ||
} | ||
return fmt.Errorf("executor Shutdown failed: %v", err) | ||
} | ||
|
||
// If the qemu process exits before the kill timeout is reached, doneChan | ||
// will close and we'll exit without an error. If it takes too long, the | ||
// timer will fire and we'll attempt to kill the process. | ||
select { | ||
case <-h.doneCh: | ||
return nil | ||
case <-time.After(h.killTimeout): | ||
h.logger.Printf("[DEBUG] driver.qemu: kill timeout of %s exceeded for user process pid %d", h.killTimeout.String(), h.userPid) | ||
|
||
if h.pluginClient.Exited() { | ||
return nil | ||
} | ||
if err := h.executor.Exit(); err != nil { | ||
return fmt.Errorf("executor Exit failed: %v", err) | ||
} | ||
|
||
return nil | ||
} | ||
} | ||
|
@@ -414,7 +476,7 @@ func (h *qemuHandle) run() { | |
ps, werr := h.executor.Wait() | ||
if ps.ExitCode == 0 && werr != nil { | ||
if e := killProcess(h.userPid); e != nil { | ||
h.logger.Printf("[ERR] driver.qemu: error killing user process: %v", e) | ||
h.logger.Printf("[ERR] driver.qemu: error killing user process pid %d: %v", h.userPid, e) | ||
} | ||
} | ||
close(h.doneCh) | ||
|
@@ -427,3 +489,25 @@ func (h *qemuHandle) run() { | |
h.waitCh <- &dstructs.WaitResult{ExitCode: ps.ExitCode, Signal: ps.Signal, Err: werr} | ||
close(h.waitCh) | ||
} | ||
|
||
func sendQemuShutdown(logger *log.Logger, monitorPath string, userPid int) error { | ||
var err error | ||
if monitorPath == "" { | ||
logger.Printf("[DEBUG] driver.qemu: monitorPath not set; will not attempt graceful shutdown for user process pid %d", userPid) | ||
err = errors.New("monitorPath not set") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could return the error here and skip the Not a blocker. |
||
} else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can remove the else statement since the if returns. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! |
||
var monitorSocket net.Conn | ||
monitorSocket, err = net.Dial("unix", monitorPath) | ||
if err == nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Swap the checking. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! |
||
defer monitorSocket.Close() | ||
logger.Printf("[DEBUG] driver.qemu: sending graceful shutdown command to qemu monitor socket %q for user process pid %d", monitorPath, userPid) | ||
_, err = monitorSocket.Write([]byte(qemuGracefulShutdownMsg)) | ||
if err != nil { | ||
logger.Printf("[WARN] driver.qemu: failed to send shutdown message %q to monitor socket %q for user process pid %d: %s", qemuGracefulShutdownMsg, monitorPath, userPid, err) | ||
} | ||
} else { | ||
logger.Printf("[WARN] driver.qemu: could not connect to qemu monitor %q for user process pid %d: %s", monitorPath, userPid, err) | ||
} | ||
} | ||
return err | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not a fan of this being an attribute. I think we can just check in the start method
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For scheduling purposes, constraining on this can be replicated by: https://www.nomadproject.io/docs/job-specification/constraint.html#quot-version-quot-
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks - I've removed this attribute as part of the changes to `getMonitorPath`!