Skip to content

Commit

Permalink
Fix exe error checks (#148)
Browse files Browse the repository at this point in the history
## What does this PR do?

Closes
#135

This fixes an issue where our permissions checks for fetching
`/proc/pid/exe` could fail for kernel procs and certain docker configs.

Also re-enables one of the tests.

## Why is it important?

This is a bug that causes data loss.

## Checklist


- [x] My code follows the style guidelines of this project
- [x] I have commented my code, particularly in hard-to-understand areas
- [x] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added an entry in `CHANGELOG.md`
  • Loading branch information
fearful-symmetry authored May 1, 2024
1 parent 15b496a commit cea4f3e
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 5 deletions.
6 changes: 5 additions & 1 deletion metric/system/process/process_container_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,12 @@ func validateProcResult(t *testing.T, result mapstr.M) {
require.NoError(t, err)
gotUser, _ := result["username"].(string)

gotPpid, ok := result["ppid"].(int)
require.True(t, ok)

// if we're root or the same user as the pid, check `exe`
if privilegedMode && (os.Getuid() == 0 || user.Name == gotUser) {
// kernel procs also don't have `exe`
if (privilegedMode && (os.Getuid() == 0 || user.Name == gotUser)) && gotPpid != 2 {
exe := result["exe"]
require.NotNil(t, exe)
}
Expand Down
8 changes: 6 additions & 2 deletions metric/system/process/process_linux_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,11 @@ func FillPidMetrics(hostfs resolve.Resolver, pid int, state ProcState, filter fu
}

state.Exe, state.Cwd, err = getProcStringData(hostfs, pid)
if err != nil && !errors.Is(err, os.ErrPermission) { // ignore permission errors
// skip permission errors and file not found errors
// see https://github.com/elastic/elastic-agent-system-metrics/issues/135 for a bit more context,
// depending on the permissions/caps that this is running with, the /exe symlink may have different levels of permission restrictions.
// A kernel proc will also return file not found.
if err != nil && !errors.Is(err, os.ErrPermission) && !errors.Is(err, os.ErrNotExist) { // ignore permission errors
return state, fmt.Errorf("error getting metadata for pid %d: %w", pid, err)
}

Expand Down Expand Up @@ -206,7 +210,7 @@ func parseProcStat(data []byte) (ProcState, error) {

func getProcStringData(hostfs resolve.Resolver, pid int) (string, string, error) {
exe, err := os.Readlink(hostfs.Join("proc", strconv.Itoa(pid), "exe"))
if errors.Is(err, os.ErrPermission) { // pass through permission errors
if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { // pass through errors
return "", "", err
} else if err != nil {
return "", "", fmt.Errorf("error fetching exe from pid %d: %w", pid, err)
Expand Down
8 changes: 6 additions & 2 deletions tests/container_system_mon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"testing"
Expand All @@ -37,7 +38,9 @@ import (
// These tests are designed for the case of monitoring a host system from inside docker via a /hostfs

func TestKernelProc(t *testing.T) {
t.Skip("This test will fail until https://github.com/elastic/elastic-agent-system-metrics/issues/135 is fixed")
if runtime.GOOS != "linux" {
t.Skip("test is linux-only")
}
_ = logp.DevelopmentSetup()
//manually fetch a kernel process
// kernel processes will have a parent pid of 2
Expand All @@ -59,7 +62,7 @@ func TestKernelProc(t *testing.T) {
statPart := strings.Split(string(statRaw), " ")
ppid := statPart[3]
if ppid == "2" {
testPid, err = strconv.ParseInt(ppid, 10, 64)
testPid, err = strconv.ParseInt(statPart[0], 10, 64)
require.NoError(t, err)
break
}
Expand All @@ -69,6 +72,7 @@ func TestKernelProc(t *testing.T) {
t.Skip("could not find kernel process")
}

t.Logf("monitoring kernel proc %d", testPid)
ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5)
defer cancel()

Expand Down

0 comments on commit cea4f3e

Please sign in to comment.