From 709b5c4192253a837894b2a25d2676d2dc4cbdc8 Mon Sep 17 00:00:00 2001
From: Alex K <8418476+fearful-symmetry@users.noreply.github.com>
Date: Wed, 6 Mar 2024 10:01:23 -0800
Subject: [PATCH] Make /proc/pid/io lookup failures skippable (#129)

## What does this PR do?

This fixes a change in the behavior of the system metrics where fetching
process metrics on a container without `SYS_PTRACE` results in a
failure, as `/proc/pid/io` requires `SYS_PTRACE`.

This flies under the radar in a handful of use cases, as our docs
mention `SYS_PTRACE` in other contexts, and many users monitoring system
data will (in my experience) end up passing `--privileged` due to [a
variety of issues with permissions in
`proc`](https://stackoverflow.com/questions/56573990/access-to-full-proc-in-docker-container).
This issue also doesn't appear outside of docker.

There is a test already that does cover this (although it'll catch it
for a permissions error, not a container capability error):
`TestFetchProcessFromOtherUser`, however, that test is usually skipped
in CI, as the CI environment lacks more than one user. The exact test
condition itself can't really be tested without running in a container.

If we want to test this more thoroughly, we'll need some kind of
integration test in beats that runs in a container and specifically
looks out for skipped processes. These kinds of tests are often flaky,
so they tend not to happen. Once we get buildkite set up in the beats
repo, we should add a test that specifically looks for skipped metrics,
probably by spinning up multiple sub-processes and having beats monitor
them directly.

## Why is it important?

This changes the behavior of system metrics inside a container.

## Checklist

- [x] My code follows the style guidelines of this project
- [x] I have commented my code, particularly in hard-to-understand areas
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added an entry in `CHANGELOG.md`
---
 CHANGELOG.md                                  |   1 +
 metric/system/process/process.go              |   5 +-
 metric/system/process/process_linux_common.go |   3 +-
 metric/system/process/process_linux_test.go   | 100 ++++++++++++++++++
 4 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ec5c10fb4c..9b631f2faf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
  - Fix CmdLine generation and caching for system.process
  - Fix process metrics collection when both cgroup V1 and V2 controllers exist
+ - Skip permissions errors when reading /proc/pid/io
 
 ## [0.7.0]
 
diff --git a/metric/system/process/process.go b/metric/system/process/process.go
index 0bdc05b061..b4ab6373e3 100644
--- a/metric/system/process/process.go
+++ b/metric/system/process/process.go
@@ -237,7 +237,10 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
 	// If we've passed the filter, continue to fill out the rest of the metrics
 	status, err = FillPidMetrics(procStats.Hostfs, pid, status, procStats.isWhitelistedEnvVar)
 	if err != nil {
-		return status, true, fmt.Errorf("FillPidMetrics: %w", err)
+		if !errors.Is(err, NonFatalErr{}) {
+			return status, true, fmt.Errorf("FillPidMetrics: %w", err)
+		}
+		procStats.logger.Debugf("Non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %s", pid, err)
 	}
 
 	if status.CPU.Total.Ticks.Exists() {
diff --git a/metric/system/process/process_linux_common.go b/metric/system/process/process_linux_common.go
index e5a4a498ee..b494cbfcb5 100644
--- a/metric/system/process/process_linux_common.go
+++ b/metric/system/process/process_linux_common.go
@@ -124,9 +124,10 @@ func FillPidMetrics(hostfs resolve.Resolver, pid int, state ProcState, filter fu
 		return state, fmt.Errorf("error creating username for pid %d: %w", pid, err)
 	}
 
+	// the /proc/[pid]/io metrics require SYS_PTRACE when run from inside docker
 	state.IO, err = getIOData(hostfs, pid)
 	if err != nil {
-		return state, fmt.Errorf("error fetching IO metrics for pid %d: %w", pid, err)
+		return state, NonFatalErr{Err: fmt.Errorf("/io unavailable; if running inside a container, use SYS_PTRACE: %w", err)}
 	}
 
 	return state, nil
diff --git a/metric/system/process/process_linux_test.go b/metric/system/process/process_linux_test.go
index dbd36d96af..bac08a6225 100644
--- a/metric/system/process/process_linux_test.go
+++ b/metric/system/process/process_linux_test.go
@@ -20,24 +20,124 @@
 package process
 
 import (
+	"errors"
+	"fmt"
 	"os"
+	"os/exec"
 	"os/user"
 	"strconv"
+	"strings"
+	"syscall"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
+	"github.com/elastic/elastic-agent-libs/logp"
 	"github.com/elastic/elastic-agent-libs/opt"
 	"github.com/elastic/elastic-agent-system-metrics/metric/system/cgroup"
 	"github.com/elastic/elastic-agent-system-metrics/metric/system/resolve"
 )
 
+// CreateUser creates a user on the machine.
+func CreateUser(name string, gid int) (int, error) {
+	args := []string{
+		"--gid", strconv.Itoa(gid),
+		"--system",
+		"--no-user-group",
+		"--shell", "/usr/bin/false",
+		name,
+	}
+	cmd := exec.Command("useradd", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		command := fmt.Sprintf("useradd %s", strings.Join(args, " "))
+		return -1, fmt.Errorf("%s failed: %w (output: %s)", command, err, output)
+	}
+	return FindUID(name)
+}
+
+// FindUID returns the user's UID on the machine.
+func FindUID(name string) (int, error) {
+	id, err := getentGetID("passwd", name)
+	if e := (&exec.ExitError{}); errors.As(err, &e) {
+		if e.ExitCode() == 2 {
+			// exit code 2 is the key doesn't exist in the database
+			return -1, fmt.Errorf("User not found")
+		}
+	}
+	return id, err
+}
+
+// helper to get a passwd entry for a user
+func getentGetID(database string, key string) (int, error) {
+	cmd := exec.Command("getent", database, key)
+	output, err := cmd.Output()
+	if err != nil {
+		return -1, fmt.Errorf("getent %s %s failed: %w (output: %s)", database, key, err, output)
+	}
+	split := strings.Split(string(output), ":")
+	if len(split) < 3 {
+		return -1, fmt.Errorf("unexpected format: %s", output)
+	}
+	val, err := strconv.Atoi(split[2])
+	if err != nil {
+		return -1, fmt.Errorf("failed to convert %s to int: %w", split[2], err)
+	}
+	return val, nil
+}
+
+func TestRunningProcessFromOtherUser(t *testing.T) {
+	// test for permission errors by creating a new user, then running a process as that user
+	testUsername := "test"
+	uid, err := CreateUser(testUsername, 0)
+	require.NoError(t, err)
+	t.Logf("uid: %v", uid)
+
+	t.Cleanup(func() {
+		// not sure how ephemeral the CI environment is, but delete the user anyway
+		cmd := exec.Command("userdel", "-f", testUsername)
+		output, err := cmd.CombinedOutput()
+		require.NoError(t, err, "got error deleting user: %s", string(output))
+	})
+
+	cmdHandler := exec.Command("sleep", "60")
+	cmdHandler.SysProcAttr = &syscall.SysProcAttr{Credential: &syscall.Credential{Uid: uint32(uid), Gid: 0}}
+
+	err = cmdHandler.Start()
+	require.NoError(t, err)
+	runPid := cmdHandler.Process.Pid
+
+	testStats := Stats{CPUTicks: true,
+		EnableCgroups: true,
+		EnableNetwork: true,
+		Hostfs:        resolve.NewTestResolver("/"),
+		Procs:         []string{".*"},
+		CgroupOpts:    cgroup.ReaderOptions{RootfsMountpoint: resolve.NewTestResolver("/")},
+	}
+	err = testStats.Init()
+	require.NoError(t, err)
+
+	uname, err := user.Current()
+	require.NoError(t, err)
+
+	result, err := testStats.GetOne(runPid)
+	require.NoError(t, err)
+	// check to make sure we still got valid results
+	require.Equal(t, "sleep 60", result["cmdline"])
+	require.NotEqual(t, uname.Name, result["username"])
+	require.NotZero(t, result["memory"].(map[string]interface{})["size"])
+	t.Logf("got result: %s", result["username"])
+
+}
+
 func TestFetchProcessFromOtherUser(t *testing.T) {
+
 	// If we just used Get() or FetchPids() to get a list of processes on the system, this would produce a bootstrapping problem
 	// where if the code wasn't working (and we were skipping over PIDs not owned by us) this test would pass.
 	// re-implement part of the core pid-fetch logic
 	// All this does is find a pid that's not owned by us.
+	_ = logp.DevelopmentSetup()
 	dir, err := os.Open("/proc/")
 	require.NoError(t, err, "error opening /proc")