From 7504c7f30b8f4e542c64a9f070802792be71d93a Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 26 Mar 2020 21:31:05 +0000 Subject: [PATCH] mantle/qemu: Support catching initramfs failures Pairs with https://github.com/coreos/ignition-dracut/pull/146 This way, we error out fast if something went wrong in the initramfs rather than timing out. And further, we get the journal as JSON, so we can do something intelligent in the future to analyze it. And add a test case for this. --- mantle/cmd/kola/qemuexec.go | 18 ++- mantle/cmd/kola/testiso.go | 2 +- mantle/kola/tests/ignition/qemufailure.go | 105 ++++++++++++++++++ mantle/platform/machine/aws/machine.go | 4 + mantle/platform/machine/azure/machine.go | 4 + mantle/platform/machine/do/machine.go | 4 + mantle/platform/machine/esx/machine.go | 4 + mantle/platform/machine/gcloud/machine.go | 4 + mantle/platform/machine/openstack/machine.go | 4 + mantle/platform/machine/packet/machine.go | 4 + mantle/platform/machine/unprivqemu/machine.go | 9 ++ mantle/platform/platform.go | 3 + mantle/platform/qemu.go | 77 +++++++++++++ mantle/platform/util.go | 14 ++- 14 files changed, 249 insertions(+), 7 deletions(-) create mode 100644 mantle/kola/tests/ignition/qemufailure.go diff --git a/mantle/cmd/kola/qemuexec.go b/mantle/cmd/kola/qemuexec.go index 7fa62c4b35..996a4cf240 100644 --- a/mantle/cmd/kola/qemuexec.go +++ b/mantle/cmd/kola/qemuexec.go @@ -50,7 +50,8 @@ var ( ignitionFragments []string - forceConfigInjection bool + forceConfigInjection bool + propagateInitramfsFailure bool ) func init() { @@ -63,6 +64,8 @@ func init() { cmdQemuExec.Flags().IntVarP(&memory, "memory", "m", 0, "Memory in MB") cmdQemuExec.Flags().StringVarP(&ignition, "ignition", "i", "", "Path to ignition config") cmdQemuExec.Flags().BoolVarP(&forceConfigInjection, "inject-ignition", "", false, "Force injecting Ignition config using guestfs") + cmdQemuExec.Flags().BoolVar(&propagateInitramfsFailure, "propagate-initramfs-failure", false, "Error out if the system fails in the initramfs") + } func renderFragments(config v3types.Config) (*v3types.Config, error) { @@ -141,8 +144,13 @@ func runQemuExec(cmd *cobra.Command, args []string) error { return err } - // Ignore errors - _ = inst.Wait() - - return nil + if propagateInitramfsFailure { + err = inst.WaitAll() + if err != nil { + return err + } + return nil + } else { + return inst.Wait() + } } diff --git a/mantle/cmd/kola/testiso.go b/mantle/cmd/kola/testiso.go index dbaeeebd31..9d49f255b6 100644 --- a/mantle/cmd/kola/testiso.go +++ b/mantle/cmd/kola/testiso.go @@ -202,7 +202,7 @@ func testPXE(inst platform.Install, completionfile string) error { } defer mach.Destroy() - err = mach.QemuInst.Wait() + err = mach.QemuInst.WaitAll() if err != nil { return err } diff --git a/mantle/kola/tests/ignition/qemufailure.go b/mantle/kola/tests/ignition/qemufailure.go new file mode 100644 index 0000000000..e2c6c18eb2 --- /dev/null +++ b/mantle/kola/tests/ignition/qemufailure.go @@ -0,0 +1,105 @@ +// Copyright 2020 Red Hat, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ignition + +import ( + "fmt" + "os" + "time" + + ignv3types "github.com/coreos/ignition/v2/config/v3_0/types" + "github.com/pkg/errors" + + "github.com/coreos/mantle/kola" + "github.com/coreos/mantle/kola/cluster" + "github.com/coreos/mantle/kola/register" + "github.com/coreos/mantle/platform" + "github.com/coreos/mantle/util" +) + +func init() { + register.RegisterTest(®ister.Test{ + Name: "coreos.ignition.failure", + Run: runIgnitionFailure, + ClusterSize: 0, + Platforms: []string{"qemu-unpriv"}, + }) +} + +func runIgnitionFailure(c cluster.TestCluster) { + if err := ignitionFailure(c); err != nil { + c.Fatal(err.Error()) + } +} + +func ignitionFailure(c cluster.TestCluster) error { + // TODO remove this once the feature lands + c.H.Skip("Needs https://github.com/coreos/ignition-dracut/pull/146") + // We can't create files in / due to the immutable bit OSTree creates, so + // this is a convenient way to test Ignition failure. + failConfig := ignv3types.Config{ + Ignition: ignv3types.Ignition{ + Version: "3.0.0", + }, + Storage: ignv3types.Storage{ + Files: []ignv3types.File{ + { + Node: ignv3types.Node{ + Path: "/notwritable.txt", + }, + FileEmbedded1: ignv3types.FileEmbedded1{ + Contents: ignv3types.FileContents{ + Source: util.StrToPtr("data:,hello%20world%0A"), + }, + Mode: util.IntToPtr(420), + }, + }, + }, + }, + } + builder := platform.NewBuilder() + builder.SetConfig(failConfig, kola.Options.IgnitionVersion == "v2") + builder.AddPrimaryDisk(&platform.Disk{ + BackingFile: kola.QEMUOptions.DiskImage, + }) + builder.Memory = 1024 + inst, err := builder.Exec() + if err != nil { + return err + } + defer inst.Destroy() + errchan := make(chan error) + go func() { + err := inst.WaitAll() + if err == nil { + err = fmt.Errorf("Ignition unexpectedly succeeded") + } else if err == platform.ErrInitramfsEmergency { + // The expected case + err = nil + } else { + err = errors.Wrapf(err, "expected initramfs emergency.target error") + } + errchan <- err + }() + go func() { + time.Sleep(2 * time.Minute) + proc := os.Process{ + Pid: inst.Pid(), + } + proc.Kill() + errchan <- errors.New("timed out waiting for initramfs error") + }() + return <-errchan +} diff --git a/mantle/platform/machine/aws/machine.go b/mantle/platform/machine/aws/machine.go index f97705d7a4..e178917605 100644 --- a/mantle/platform/machine/aws/machine.go +++ b/mantle/platform/machine/aws/machine.go @@ -64,6 +64,10 @@ func (am *machine) SSH(cmd string) ([]byte, []byte, error) { return am.cluster.SSH(am, cmd) } +func (am *machine) IgnitionError() error { + return nil +} + func (am *machine) Reboot() error { return platform.RebootMachine(am, am.journal) } diff --git a/mantle/platform/machine/azure/machine.go b/mantle/platform/machine/azure/machine.go index b9ef0002dd..3d58847a41 100644 --- a/mantle/platform/machine/azure/machine.go +++ b/mantle/platform/machine/azure/machine.go @@ -74,6 +74,10 @@ func (am *machine) SSH(cmd string) ([]byte, []byte, error) { return am.cluster.SSH(am, cmd) } +func (am *machine) IgnitionError() error { + return nil +} + // Re-fetch the Public & Private IP address for the event that it's changed during the reboot func (am *machine) refetchIPs() error { var err error diff --git a/mantle/platform/machine/do/machine.go b/mantle/platform/machine/do/machine.go index beff9df18f..24e413e609 100644 --- a/mantle/platform/machine/do/machine.go +++ b/mantle/platform/machine/do/machine.go @@ -61,6 +61,10 @@ func (dm *machine) SSH(cmd string) ([]byte, []byte, error) { return dm.cluster.SSH(dm, cmd) } +func (dm *machine) IgnitionError() error { + return nil +} + func (dm *machine) Reboot() error { return platform.RebootMachine(dm, dm.journal) } diff --git a/mantle/platform/machine/esx/machine.go b/mantle/platform/machine/esx/machine.go index 793fff998d..b6811c0adb 100644 --- a/mantle/platform/machine/esx/machine.go +++ b/mantle/platform/machine/esx/machine.go @@ -62,6 +62,10 @@ func (em *machine) SSH(cmd string) ([]byte, []byte, error) { return em.cluster.SSH(em, cmd) } +func (em *machine) IgnitionError() error { + return nil +} + func (em *machine) Reboot() error { return platform.RebootMachine(em, em.journal) } diff --git a/mantle/platform/machine/gcloud/machine.go b/mantle/platform/machine/gcloud/machine.go index d57bcc7c8f..89f32d8358 100644 --- a/mantle/platform/machine/gcloud/machine.go +++ b/mantle/platform/machine/gcloud/machine.go @@ -62,6 +62,10 @@ func (gm *machine) SSH(cmd string) ([]byte, []byte, error) { return gm.gc.SSH(gm, cmd) } +func (gm *machine) IgnitionError() error { + return nil +} + func (gm *machine) Reboot() error { return platform.RebootMachine(gm, gm.journal) } diff --git a/mantle/platform/machine/openstack/machine.go b/mantle/platform/machine/openstack/machine.go index 7d19b47062..33655251ad 100644 --- a/mantle/platform/machine/openstack/machine.go +++ b/mantle/platform/machine/openstack/machine.go @@ -83,6 +83,10 @@ func (om *machine) SSH(cmd string) ([]byte, []byte, error) { return om.cluster.SSH(om, cmd) } +func (om *machine) IgnitionError() error { + return nil +} + func (om *machine) Reboot() error { return platform.RebootMachine(om, om.journal) } diff --git a/mantle/platform/machine/packet/machine.go b/mantle/platform/machine/packet/machine.go index 10768b1300..de6bb1384f 100644 --- a/mantle/platform/machine/packet/machine.go +++ b/mantle/platform/machine/packet/machine.go @@ -61,6 +61,10 @@ func (pm *machine) SSH(cmd string) ([]byte, []byte, error) { return pm.cluster.SSH(pm, cmd) } +func (pm *machine) IgnitionError() error { + return nil +} + func (pm *machine) Reboot() error { return platform.RebootMachine(pm, pm.journal) } diff --git a/mantle/platform/machine/unprivqemu/machine.go b/mantle/platform/machine/unprivqemu/machine.go index 7fb8a9b093..d9aeeec2b5 100644 --- a/mantle/platform/machine/unprivqemu/machine.go +++ b/mantle/platform/machine/unprivqemu/machine.go @@ -61,6 +61,15 @@ func (m *machine) SSH(cmd string) ([]byte, []byte, error) { return m.qc.SSH(m, cmd) } +func (m *machine) IgnitionError() error { + _, err := m.inst.WaitIgnitionError() + if err != nil { + return err + } + // TODO render buf + return platform.ErrInitramfsEmergency +} + func (m *machine) Reboot() error { return platform.RebootMachine(m, m.journal) } diff --git a/mantle/platform/platform.go b/mantle/platform/platform.go index a723d0a447..aca335fc3a 100644 --- a/mantle/platform/platform.go +++ b/mantle/platform/platform.go @@ -51,6 +51,9 @@ type Machine interface { // ID returns the plaform-specific machine identifier. ID() string + // IgnitionError returns an error if the machine failed in Ignition + IgnitionError() error + // IP returns the machine's public IP. IP() string diff --git a/mantle/platform/qemu.go b/mantle/platform/qemu.go index 6c90d3e6a9..95c267f4b2 100644 --- a/mantle/platform/qemu.go +++ b/mantle/platform/qemu.go @@ -15,6 +15,7 @@ package platform import ( + "bufio" "encoding/json" "fmt" "io" @@ -34,6 +35,10 @@ import ( "github.com/pkg/errors" ) +var ( + ErrInitramfsEmergency = errors.New("entered emergency.target in initramfs") +) + type MachineOptions struct { AdditionalDisks []Disk } @@ -51,6 +56,8 @@ type QemuInstance struct { tmpConfig string swtpmTmpd string swtpm exec.Cmd + + journalPipe *os.File } func (inst *QemuInstance) Pid() int { @@ -115,14 +122,73 @@ func (inst *QemuInstance) SSHAddress() (string, error) { return "", fmt.Errorf("didn't find an address") } +// Wait for the qemu process to exit func (inst *QemuInstance) Wait() error { return inst.qemu.Wait() } +// WaitIgnitionError will only return if the instance +// failed inside the initramfs. The resulting string will +// be a newline-delimited stream of JSON strings, as returned +// by `journalctl -o json`. +func (inst *QemuInstance) WaitIgnitionError() (string, error) { + b := bufio.NewReaderSize(inst.journalPipe, 64768) + var r strings.Builder + iscorrupted := false + _, err := b.Peek(1) + if err != nil { + if err == io.EOF { + return "", nil + } + return "", errors.Wrapf(err, "Reading from journal") + } + for { + line, prefix, err := b.ReadLine() + if err != nil { + return r.String(), errors.Wrapf(err, "Reading from journal channel") + } + if prefix { + iscorrupted = true + } + if len(line) == 0 || string(line) == "{}" { + break + } + r.Write(line) + } + if iscorrupted { + return r.String(), fmt.Errorf("journal was truncated due to overly long line") + } + return r.String(), nil +} + +// WaitAll wraps the process exit as well as WaitIgnitionError, +// returning an error if either fail. +func (inst *QemuInstance) WaitAll() error { + c := make(chan error) + go func() { + buf, err := inst.WaitIgnitionError() + if err != nil { + c <- err + } else { + // TODO parse buf and try to nicely render something + if buf != "" { + c <- ErrInitramfsEmergency + } + } + }() + go func() { + c <- inst.Wait() + }() + return <-c +} + func (inst *QemuInstance) Destroy() { if inst.tmpConfig != "" { os.Remove(inst.tmpConfig) } + if inst.journalPipe != nil { + inst.journalPipe.Close() + } if inst.qemu != nil { if err := inst.qemu.Kill(); err != nil { plog.Errorf("Error killing qemu instance %v: %v", inst.Pid(), err) @@ -728,6 +794,17 @@ func (builder *QemuBuilder) Exec() (*QemuInstance, error) { "-tpmdev", "emulator,id=tpm0,chardev=chrtpm", "-device", "tpm-tis,tpmdev=tpm0") } + // Set up the virtio channel to get Ignition failures by default + journalPipeR, journalPipeW, err := os.Pipe() + if err != nil { + return nil, errors.Wrapf(err, "creating journal pipe") + } + inst.journalPipe = journalPipeR + argv = append(argv, "-device", "virtio-serial") + // https://www.redhat.com/archives/libvir-list/2015-December/msg00305.html + argv = append(argv, "-chardev", fmt.Sprintf("file,id=ignition-dracut,path=%s,append=on", builder.AddFd(journalPipeW))) + argv = append(argv, "-device", "virtserialport,chardev=ignition-dracut,name=com.coreos.ignition.journal") + fdnum := 3 // first additional file starts at position 3 for i, _ := range builder.fds { fdset := i + 1 // Start at 1 diff --git a/mantle/platform/util.go b/mantle/platform/util.go index f7c97dd272..4b48d37877 100644 --- a/mantle/platform/util.go +++ b/mantle/platform/util.go @@ -183,7 +183,19 @@ func StartMachineAfterReboot(m Machine, j *Journal, oldBootId string) error { // StartMachine will start a given machine, provided the machine's journal. func StartMachine(m Machine, j *Journal) error { - return StartMachineAfterReboot(m, j, "") + errchan := make(chan error) + go func() { + err := m.IgnitionError() + if err != nil { + plog.Infof("machine %s entered emergency.target in initramfs: %v", m.ID(), err) + errchan <- err + } + }() + go func() { + // This one ends up connecting to the journal via ssh + errchan <- StartMachineAfterReboot(m, j, "") + }() + return <-errchan } func GetMachineBootId(m Machine) (string, error) {