coreos · cgwalters · Mar 2, 2021 · jlebon · Mar 8, 2021 · jlebon
diff --git a/ci/run-kola-self-tests b/ci/run-kola-self-tests
@@ -3,6 +3,20 @@ set -euo pipefail
 dn=$(cd $(dirname $0) && pwd)
 toplevel=$(cd ${dn} && git rev-parse --show-toplevel)
 set -x
+# Unit failure
+if cosa kola run --output-dir tmp/kolaself -E ${toplevel}/tests/kola-ci-unit-failure 'ext.kola-ci*' |& tee tmp/err.txt; then
+    echo "expected kola-ci-unit-failure to fail" 1>&2
+    exit 1
+fi
+if ! grep -qF 'failed basic checks' tmp/err.txt; then
+    echo "Missing expected failed basic checks error" 1>&2
+    exit 1
+fi
+# And this run should pass
+cosa kola run --output-dir tmp/kolaself -E ${toplevel}/tests/kola-ci-unit-failure 'ext.kola-ci*' --no-default-checks
+
+rm tmp/kolaself -rf
+
 cosa kola run --output-dir tmp/kolaself -E ${toplevel}/tests/kola-ci-self 'ext.kola-ci-self*' "$@"
 log=$(find tmp/kolaself -type f -name kola-runext-autopkgtest-reboot.txt)
 test -n "${log}"

diff --git a/mantle/cmd/kola/options.go b/mantle/cmd/kola/options.go
@@ -63,6 +63,7 @@ func init() {
 	bv(&kola.NoNet, "no-net", false, "Don't run tests that require an Internet connection")
 	ssv(&kola.Tags, "tag", []string{}, "Test tag to run. Can be specified multiple times.")
 	bv(&kola.Options.SSHOnTestFailure, "ssh-on-test-failure", false, "SSH into a machine when tests fail")
+	bv(&kola.Options.SuppressDefaultChecks, "no-default-checks", false, "Disable default checks for failed systemd units and SELinux AVC denials")
 	sv(&kola.Options.Stream, "stream", "", "CoreOS stream ID (e.g. for Fedora CoreOS: stable, testing, next)")
 	sv(&kola.Options.CosaWorkdir, "workdir", "", "coreos-assembler working directory")
 	sv(&kola.Options.CosaBuildId, "build", "", "coreos-assembler build ID")

diff --git a/mantle/kola/harness.go b/mantle/kola/harness.go
@@ -16,6 +16,7 @@ package kola
 
 import (
 	"bufio"
+	"context"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -883,9 +884,12 @@ func getClusterSemver(flight platform.Flight, outputDir string) (*semver.Version
 		return nil, err
 	}
 
-	cluster, err := flight.NewCluster(&platform.RuntimeConfig{
-		OutputDir: testDir,
-	})
+	cfg := &platform.RuntimeConfig{
+		OutputDir:        testDir,
+		AllowFailedUnits: Options.SuppressDefaultChecks,
+	}
+
+	cluster, err := flight.NewCluster(cfg)
 	if err != nil {
 		return nil, errors.Wrapf(err, "creating cluster for semver check")
 	}
@@ -933,6 +937,7 @@ func runTest(h *harness.H, t *register.Test, pltfrm string, flight platform.Flig
 		NoSSHKeyInUserData: t.HasFlag(register.NoSSHKeyInUserData),
 		NoSSHKeyInMetadata: t.HasFlag(register.NoSSHKeyInMetadata),
 		InternetAccess:     testRequiresInternet(t),
+		AllowFailedUnits:   Options.SuppressDefaultChecks,
 	}
 	c, err := flight.NewCluster(rconf)
 	if err != nil {
@@ -1036,6 +1041,12 @@ func runTest(h *harness.H, t *register.Test, pltfrm string, flight platform.Flig
 
 	// run test
 	t.Run(tcluster)
+
+	for _, mach := range c.Machines() {
+		if err := platform.CheckMachineBasics(context.TODO(), mach); err != nil {
+			h.Errorf("Machine %s failed basic checks: %v", mach.ID(), err)
+		}
+	}
 }
 
 // scpKolet searches for a kolet binary and copies it to the machine.

diff --git a/mantle/platform/platform.go b/mantle/platform/platform.go
@@ -186,6 +186,8 @@ type Options struct {
 	OSContainer string
 
 	SSHOnTestFailure bool
+	// SuppressDefaultChecks will skip built-in checks for systemd unit failures and SELinux AVC denials, etc.
+	SuppressDefaultChecks bool
 }
 
 // RuntimeConfig contains cluster-specific configuration.
@@ -435,20 +437,46 @@ func CheckMachine(ctx context.Context, m Machine) error {
 		return fmt.Errorf("not a supported instance: %v", string(out))
 	}
 
-	if !m.RuntimeConf().AllowFailedUnits {
-		// ensure no systemd units failed during boot
-		out, stderr, err = m.SSH("systemctl --no-legend --state failed list-units")
-		if err != nil {
-			return fmt.Errorf("systemctl: %s: %v: %s", out, err, stderr)
-		}
-		if len(out) > 0 {
-			return fmt.Errorf("some systemd units failed:\n%s", out)
-		}
+	// Validate that nothing is wrong before we even run the test
+	if err := CheckMachineBasics(ctx, m); err != nil {
+		return err
 	}
 
 	return ctx.Err()
 }
 
+// CheckMachineBasics validates that no systemd units have
+// failed and no SELinux AVC denials were logged.  It may be
+// extended in the future - the idea is these are "baseline"
+// errors that shouldn't be seen across any tests.
+func CheckMachineBasics(ctx context.Context, m Machine) error {
+	if m.RuntimeConf().AllowFailedUnits {
+		plog.Debug("Allowing failed units")
+		return nil
+	}
+
+	// ensure no systemd units failed during boot
+	out, stderr, err := m.SSH("systemctl --no-legend --state failed list-units")
+	if err != nil {
+		return fmt.Errorf("systemctl: %s: %v: %s", out, err, stderr)
+	}
+	if len(out) > 0 {
+		return fmt.Errorf("some systemd units failed:\n%s", out)
+	}
+	// Also no SELinux denials; RHCOS currently ships auditd, FCOS doesn't, so handle
+	// both places.
+	out, stderr, err = m.SSH("if test -f /var/log/audit/audit.log; then grep 'avc.*denied' /var/log/audit/audit.log || true; else journalctl -q --no-pager --grep='avc.*denied' _TRANSPORT=audit || true; fi")
+	if err != nil {
+		return fmt.Errorf("failed to query audit logs: %s: %v: %s", out, err, stderr)
+	}
+	if len(out) > 0 {
+		return fmt.Errorf("Found SELinux AVC denials:\n%s", out)
+	}
+	plog.Debug("Validated machine")
+
+	return nil
+}
+
 type machineInfo struct {
 	Id        string `json:"id"`
 	PublicIp  string `json:"public_ip"`

diff --git a/src/cmd-kola b/src/cmd-kola
@@ -8,6 +8,9 @@ import platform
 import sys
 import yaml
 
+# Special pattern that skips systemd unit checks/SELinux AVCs for all tests
+SPECIAL_DEFAULT_CHECKS = 'default-checks'
+
 # Just test these boot to start with.  In the future we should at least
 # do ostree upgrades with uefi etc.  But we don't really need the *full*
 # suite...if podman somehow broke with nvme or uefi I'd be amazed and impressed.
@@ -57,7 +60,11 @@ try:
                 continue
             print(f"⚠️  Skipping kola test pattern \"{obj['pattern']}\":")
             print(f"⚠️  {obj['tracker']}")
-            kolaargs.extend(['--denylist-test', obj['pattern']])
+            pattern = obj['pattern']
+            if pattern == SPECIAL_DEFAULT_CHECKS:
+                kolaargs.append('--no-default-checks')
+            else:
+                kolaargs.extend(['--denylist-test', pattern])
 except FileNotFoundError:
     pass
 

diff --git a/tests/kola-ci-unit-failure/tests/kola/unit-failure-skip b/tests/kola-ci-unit-failure/tests/kola/unit-failure-skip
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Intentionally cause a systemd unit failure that should be skipped with --no-default-checks
+set -xeuo pipefail
+systemd-run --wait --unit should-fail.service -q false || true
+echo "ok started failed service"
+