Skip to content

Commit

Permalink
Make /proc/pid/io lookup failures skippable (#129)
Browse files Browse the repository at this point in the history
## What does this PR do?

This fixes a change in the behavior of the system metrics where fetching
process metrics on a container without `SYS_PTRACE` results in a
failure, as `/proc/pid/io` requires `SYS_PTRACE`.

This flies under the radar in a handful of use cases, as our docs
mention `SYS_PTRACE` in other contexts, and many users monitoring system
data will (in my experience) end up passing `--privileged` due to [a
variety of issues with permissions in
`proc`](https://stackoverflow.com/questions/56573990/access-to-full-proc-in-docker-container).
This issue also doesn't appear outside of docker.

There is a test already that does cover this (although it'll catch it
for a permissions error, not a container capability error):
`TestFetchProcessFromOtherUser`, however, that test is usually skipped
in CI, as the CI environment lacks more than one user. The exact test
condition itself can't really be tested without running in a container.

If we want to test this more thoroughly, we'll need some kind of
integration test in beats that runs in a container and specifically
looks out for skipped processes. These kinds of tests are often flaky,
so they tend not to happen. Once we get buildkite set up in the beats
repo, we should add a test that specifically looks for skipped metrics,
probably by spinning up multiple sub-processes and having beats monitor
them directly.

## Why is it important?

This changes the behavior of system metrics inside a container.

## Checklist

- [x] My code follows the style guidelines of this project
- [x] I have commented my code, particularly in hard-to-understand areas
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added an entry in `CHANGELOG.md`
  • Loading branch information
fearful-symmetry authored Mar 6, 2024
1 parent 7f008f4 commit 709b5c4
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).

- Fix CmdLine generation and caching for system.process
- Fix process metrics collection when both cgroup V1 and V2 controllers exist
- Skip permissions errors when reading /proc/pid/io

## [0.7.0]

Expand Down
5 changes: 4 additions & 1 deletion metric/system/process/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,10 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
// If we've passed the filter, continue to fill out the rest of the metrics
status, err = FillPidMetrics(procStats.Hostfs, pid, status, procStats.isWhitelistedEnvVar)
if err != nil {
return status, true, fmt.Errorf("FillPidMetrics: %w", err)
if !errors.Is(err, NonFatalErr{}) {
return status, true, fmt.Errorf("FillPidMetrics: %w", err)
}
procStats.logger.Debugf("Non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %s", pid, err)
}

if status.CPU.Total.Ticks.Exists() {
Expand Down
3 changes: 2 additions & 1 deletion metric/system/process/process_linux_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,10 @@ func FillPidMetrics(hostfs resolve.Resolver, pid int, state ProcState, filter fu
return state, fmt.Errorf("error creating username for pid %d: %w", pid, err)
}

// the /proc/[pid]/io metrics require SYS_PTRACE when run from inside docker
state.IO, err = getIOData(hostfs, pid)
if err != nil {
return state, fmt.Errorf("error fetching IO metrics for pid %d: %w", pid, err)
return state, NonFatalErr{Err: fmt.Errorf("/io unavailable; if running inside a container, use SYS_PTRACE: %w", err)}
}

return state, nil
Expand Down
100 changes: 100 additions & 0 deletions metric/system/process/process_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,124 @@
package process

import (
"errors"
"fmt"
"os"
"os/exec"
"os/user"
"strconv"
"strings"
"syscall"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/elastic/elastic-agent-libs/logp"
"github.com/elastic/elastic-agent-libs/opt"
"github.com/elastic/elastic-agent-system-metrics/metric/system/cgroup"
"github.com/elastic/elastic-agent-system-metrics/metric/system/resolve"
)

// CreateUser creates a user on the machine.
func CreateUser(name string, gid int) (int, error) {
args := []string{
"--gid", strconv.Itoa(gid),
"--system",
"--no-user-group",
"--shell", "/usr/bin/false",
name,
}
cmd := exec.Command("useradd", args...)
output, err := cmd.CombinedOutput()
if err != nil {
command := fmt.Sprintf("useradd %s", strings.Join(args, " "))
return -1, fmt.Errorf("%s failed: %w (output: %s)", command, err, output)
}
return FindUID(name)
}

// FindUID returns the user's UID on the machine.
func FindUID(name string) (int, error) {
id, err := getentGetID("passwd", name)
if e := (&exec.ExitError{}); errors.As(err, &e) {
if e.ExitCode() == 2 {
// exit code 2 is the key doesn't exist in the database
return -1, fmt.Errorf("User not found")
}
}
return id, err
}

// helper to get a passwd entry for a user
func getentGetID(database string, key string) (int, error) {
cmd := exec.Command("getent", database, key)
output, err := cmd.Output()
if err != nil {
return -1, fmt.Errorf("getent %s %s failed: %w (output: %s)", database, key, err, output)
}
split := strings.Split(string(output), ":")
if len(split) < 3 {
return -1, fmt.Errorf("unexpected format: %s", output)
}
val, err := strconv.Atoi(split[2])
if err != nil {
return -1, fmt.Errorf("failed to convert %s to int: %w", split[2], err)
}
return val, nil
}

func TestRunningProcessFromOtherUser(t *testing.T) {
// test for permission errors by creating a new user, then running a process as that user
testUsername := "test"
uid, err := CreateUser(testUsername, 0)
require.NoError(t, err)
t.Logf("uid: %v", uid)

t.Cleanup(func() {
// not sure how ephemeral the CI environment is, but delete the user anyway
cmd := exec.Command("userdel", "-f", testUsername)
output, err := cmd.CombinedOutput()
require.NoError(t, err, "got error deleting user: %s", string(output))
})

cmdHandler := exec.Command("sleep", "60")
cmdHandler.SysProcAttr = &syscall.SysProcAttr{Credential: &syscall.Credential{Uid: uint32(uid), Gid: 0}}

err = cmdHandler.Start()
require.NoError(t, err)
runPid := cmdHandler.Process.Pid

testStats := Stats{CPUTicks: true,
EnableCgroups: true,
EnableNetwork: true,
Hostfs: resolve.NewTestResolver("/"),
Procs: []string{".*"},
CgroupOpts: cgroup.ReaderOptions{RootfsMountpoint: resolve.NewTestResolver("/")},
}
err = testStats.Init()
require.NoError(t, err)

uname, err := user.Current()
require.NoError(t, err)

result, err := testStats.GetOne(runPid)
require.NoError(t, err)
// check to make sure we still got valid results
require.Equal(t, "sleep 60", result["cmdline"])
require.NotEqual(t, uname.Name, result["username"])
require.NotZero(t, result["memory"].(map[string]interface{})["size"])
t.Logf("got result: %s", result["username"])

}

func TestFetchProcessFromOtherUser(t *testing.T) {

// If we just used Get() or FetchPids() to get a list of processes on the system, this would produce a bootstrapping problem
// where if the code wasn't working (and we were skipping over PIDs not owned by us) this test would pass.
// re-implement part of the core pid-fetch logic
// All this does is find a pid that's not owned by us.
_ = logp.DevelopmentSetup()
dir, err := os.Open("/proc/")
require.NoError(t, err, "error opening /proc")

Expand Down

0 comments on commit 709b5c4

Please sign in to comment.