diff --git a/CHANGELOG.md b/CHANGELOG.md index ec5c10fb4c..9b631f2faf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ This project adheres to [Semantic Versioning](http://semver.org/). - Fix CmdLine generation and caching for system.process - Fix process metrics collection when both cgroup V1 and V2 controllers exist + - Skip permissions errors when reading /proc/pid/io ## [0.7.0] diff --git a/metric/system/process/process.go b/metric/system/process/process.go index 0bdc05b061..b4ab6373e3 100644 --- a/metric/system/process/process.go +++ b/metric/system/process/process.go @@ -237,7 +237,10 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) { // If we've passed the filter, continue to fill out the rest of the metrics status, err = FillPidMetrics(procStats.Hostfs, pid, status, procStats.isWhitelistedEnvVar) if err != nil { - return status, true, fmt.Errorf("FillPidMetrics: %w", err) + if !errors.Is(err, NonFatalErr{}) { + return status, true, fmt.Errorf("FillPidMetrics: %w", err) + } + procStats.logger.Debugf("Non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %s", pid, err) } if status.CPU.Total.Ticks.Exists() { diff --git a/metric/system/process/process_linux_common.go b/metric/system/process/process_linux_common.go index e5a4a498ee..b494cbfcb5 100644 --- a/metric/system/process/process_linux_common.go +++ b/metric/system/process/process_linux_common.go @@ -124,9 +124,10 @@ func FillPidMetrics(hostfs resolve.Resolver, pid int, state ProcState, filter fu return state, fmt.Errorf("error creating username for pid %d: %w", pid, err) } + // the /proc/[pid]/io metrics require SYS_PTRACE when run from inside docker state.IO, err = getIOData(hostfs, pid) if err != nil { - return state, fmt.Errorf("error fetching IO metrics for pid %d: %w", pid, err) + return state, NonFatalErr{Err: fmt.Errorf("/io unavailable; if running inside a container, use SYS_PTRACE: %w", err)} } return state, nil diff --git a/metric/system/process/process_linux_test.go b/metric/system/process/process_linux_test.go index dbd36d96af..bac08a6225 100644 --- a/metric/system/process/process_linux_test.go +++ b/metric/system/process/process_linux_test.go @@ -20,24 +20,124 @@ package process import ( + "errors" + "fmt" "os" + "os/exec" "os/user" "strconv" + "strings" + "syscall" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent-libs/opt" "github.com/elastic/elastic-agent-system-metrics/metric/system/cgroup" "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" ) +// CreateUser creates a user on the machine. +func CreateUser(name string, gid int) (int, error) { + args := []string{ + "--gid", strconv.Itoa(gid), + "--system", + "--no-user-group", + "--shell", "/usr/bin/false", + name, + } + cmd := exec.Command("useradd", args...) + output, err := cmd.CombinedOutput() + if err != nil { + command := fmt.Sprintf("useradd %s", strings.Join(args, " ")) + return -1, fmt.Errorf("%s failed: %w (output: %s)", command, err, output) + } + return FindUID(name) +} + +// FindUID returns the user's UID on the machine. +func FindUID(name string) (int, error) { + id, err := getentGetID("passwd", name) + if e := (&exec.ExitError{}); errors.As(err, &e) { + if e.ExitCode() == 2 { + // exit code 2 is the key doesn't exist in the database + return -1, fmt.Errorf("User not found") + } + } + return id, err +} + +// helper to get a passwd entry for a user +func getentGetID(database string, key string) (int, error) { + cmd := exec.Command("getent", database, key) + output, err := cmd.Output() + if err != nil { + return -1, fmt.Errorf("getent %s %s failed: %w (output: %s)", database, key, err, output) + } + split := strings.Split(string(output), ":") + if len(split) < 3 { + return -1, fmt.Errorf("unexpected format: %s", output) + } + val, err := strconv.Atoi(split[2]) + if err != nil { + return -1, fmt.Errorf("failed to convert %s to int: %w", split[2], err) + } + return val, nil +} + +func TestRunningProcessFromOtherUser(t *testing.T) { + // test for permission errors by creating a new user, then running a process as that user + testUsername := "test" + uid, err := CreateUser(testUsername, 0) + require.NoError(t, err) + t.Logf("uid: %v", uid) + + t.Cleanup(func() { + // not sure how ephemeral the CI environment is, but delete the user anyway + cmd := exec.Command("userdel", "-f", testUsername) + output, err := cmd.CombinedOutput() + require.NoError(t, err, "got error deleting user: %s", string(output)) + }) + + cmdHandler := exec.Command("sleep", "60") + cmdHandler.SysProcAttr = &syscall.SysProcAttr{Credential: &syscall.Credential{Uid: uint32(uid), Gid: 0}} + + err = cmdHandler.Start() + require.NoError(t, err) + runPid := cmdHandler.Process.Pid + + testStats := Stats{CPUTicks: true, + EnableCgroups: true, + EnableNetwork: true, + Hostfs: resolve.NewTestResolver("/"), + Procs: []string{".*"}, + CgroupOpts: cgroup.ReaderOptions{RootfsMountpoint: resolve.NewTestResolver("/")}, + } + err = testStats.Init() + require.NoError(t, err) + + uname, err := user.Current() + require.NoError(t, err) + + result, err := testStats.GetOne(runPid) + require.NoError(t, err) + // check to make sure we still got valid results + require.Equal(t, "sleep 60", result["cmdline"]) + require.NotEqual(t, uname.Name, result["username"]) + require.NotZero(t, result["memory"].(map[string]interface{})["size"]) + t.Logf("got result: %s", result["username"]) + +} + func TestFetchProcessFromOtherUser(t *testing.T) { + // If we just used Get() or FetchPids() to get a list of processes on the system, this would produce a bootstrapping problem // where if the code wasn't working (and we were skipping over PIDs not owned by us) this test would pass. // re-implement part of the core pid-fetch logic // All this does is find a pid that's not owned by us. + _ = logp.DevelopmentSetup() dir, err := os.Open("/proc/") require.NoError(t, err, "error opening /proc")