From cea4f3eb00e58ac0ab2bf22ad8dd5de45712b866 Mon Sep 17 00:00:00 2001 From: Alex K <8418476+fearful-symmetry@users.noreply.github.com> Date: Wed, 1 May 2024 09:07:02 -0700 Subject: [PATCH] Fix exe error checks (#148) ## What does this PR do? Closes https://github.com/elastic/elastic-agent-system-metrics/issues/135 This fixes an issue where our permissions checks for fetching `/proc/pid/exe` could fail for kernel procs and certain docker configs. Also re-enables one of the tests. ## Why is it important? This is a bug that causes data loss. ## Checklist - [x] My code follows the style guidelines of this project - [x] I have commented my code, particularly in hard-to-understand areas - [x] I have added tests that prove my fix is effective or that my feature works - [ ] I have added an entry in `CHANGELOG.md` --- metric/system/process/process_container_test.go | 6 +++++- metric/system/process/process_linux_common.go | 8 ++++++-- tests/container_system_mon_test.go | 8 ++++++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/metric/system/process/process_container_test.go b/metric/system/process/process_container_test.go index eacd979c3b..63de01c356 100644 --- a/metric/system/process/process_container_test.go +++ b/metric/system/process/process_container_test.go @@ -90,8 +90,12 @@ func validateProcResult(t *testing.T, result mapstr.M) { require.NoError(t, err) gotUser, _ := result["username"].(string) + gotPpid, ok := result["ppid"].(int) + require.True(t, ok) + // if we're root or the same user as the pid, check `exe` - if privilegedMode && (os.Getuid() == 0 || user.Name == gotUser) { + // kernel procs also don't have `exe` + if (privilegedMode && (os.Getuid() == 0 || user.Name == gotUser)) && gotPpid != 2 { exe := result["exe"] require.NotNil(t, exe) } diff --git a/metric/system/process/process_linux_common.go b/metric/system/process/process_linux_common.go index b494cbfcb5..81575ffd81 100644 --- a/metric/system/process/process_linux_common.go +++ b/metric/system/process/process_linux_common.go @@ -115,7 +115,11 @@ func FillPidMetrics(hostfs resolve.Resolver, pid int, state ProcState, filter fu } state.Exe, state.Cwd, err = getProcStringData(hostfs, pid) - if err != nil && !errors.Is(err, os.ErrPermission) { // ignore permission errors + // skip permission errors and file not found errors + // see https://github.com/elastic/elastic-agent-system-metrics/issues/135 for a bit more context, + // depending on the permissions/caps that this is running with, the /exe symlink may have different levels of permission restrictions. + // A kernel proc will also return file not found. + if err != nil && !errors.Is(err, os.ErrPermission) && !errors.Is(err, os.ErrNotExist) { // ignore permission errors return state, fmt.Errorf("error getting metadata for pid %d: %w", pid, err) } @@ -206,7 +210,7 @@ func parseProcStat(data []byte) (ProcState, error) { func getProcStringData(hostfs resolve.Resolver, pid int) (string, string, error) { exe, err := os.Readlink(hostfs.Join("proc", strconv.Itoa(pid), "exe")) - if errors.Is(err, os.ErrPermission) { // pass through permission errors + if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { // pass through errors return "", "", err } else if err != nil { return "", "", fmt.Errorf("error fetching exe from pid %d: %w", pid, err) diff --git a/tests/container_system_mon_test.go b/tests/container_system_mon_test.go index 734e22e160..5968062c95 100644 --- a/tests/container_system_mon_test.go +++ b/tests/container_system_mon_test.go @@ -22,6 +22,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "strconv" "strings" "testing" @@ -37,7 +38,9 @@ import ( // These tests are designed for the case of monitoring a host system from inside docker via a /hostfs func TestKernelProc(t *testing.T) { - t.Skip("This test will fail until https://github.com/elastic/elastic-agent-system-metrics/issues/135 is fixed") + if runtime.GOOS != "linux" { + t.Skip("test is linux-only") + } _ = logp.DevelopmentSetup() //manually fetch a kernel process // kernel processes will have a parent pid of 2 @@ -59,7 +62,7 @@ func TestKernelProc(t *testing.T) { statPart := strings.Split(string(statRaw), " ") ppid := statPart[3] if ppid == "2" { - testPid, err = strconv.ParseInt(ppid, 10, 64) + testPid, err = strconv.ParseInt(statPart[0], 10, 64) require.NoError(t, err) break } @@ -69,6 +72,7 @@ func TestKernelProc(t *testing.T) { t.Skip("could not find kernel process") } + t.Logf("monitoring kernel proc %d", testPid) ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) defer cancel()