Skip to content

Commit

Permalink
v2: Fix inotify leak when cgroup is deleted
Browse files Browse the repository at this point in the history
When running on cgroup2, we currently leak the inotify instance (and goroutine
blocked on read) used to monitor 'memory.events' on any container exit. This is
highly problematic when containers are automatically restarted because we will
exhaust either the fd limit or system-wide inotify instance limit.

When a process exits, no memory event is generated, and even when the cgroup is
deleted the inotify read is not unblocked. This is not the case when
containerd is running on cgroup (v1) because that uses a different mechanism
for notification and detects cgroup deletion.

Fulfill the contract on cgroup2 by additionally monitoring cgroup.events for
process exit. When the last process exits the kernel signals an event on
'cgroup.events'. For robustness we check both 'cgroup.events' and
'memory.events' on any notification and also handle ENOENT/ENODEV errors from
read/open of 'memory.events'.

We signal exit up the stack by closing the error channel. Strangely, the error
channel was not previously being returned to the caller.

Signed-off-by: Jeremi Piotrowski <[email protected]>
  • Loading branch information
jepio committed Dec 8, 2021
1 parent 2b95ef0 commit beb8965
Showing 1 changed file with 39 additions and 9 deletions.
48 changes: 39 additions & 9 deletions v2/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,23 @@ func (c *Manager) freeze(path string, state State) error {
}
}

// isCgroupEmpty reports whether the cgroup should be treated as having no
// processes, based on the "populated" key read from its cgroup.events file.
//
// It deliberately errs on the side of "empty": a read failure, a missing
// "populated" key, or a value that is not a uint64 all yield true, so that
// the caller exits instead of leaking resources.
func (c *Manager) isCgroupEmpty() bool {
	events := make(map[string]interface{})
	if readErr := readKVStatsFile(c.path, "cgroup.events", events); readErr != nil {
		return true
	}
	// A failed assertion covers both an absent key (nil interface value) and
	// a value of an unexpected type.
	count, isUint := events["populated"].(uint64)
	return !isUint || count == 0
}

// MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor
func (c *Manager) MemoryEventFD() (int, uint32, error) {
fpath := filepath.Join(c.path, "memory.events")
Expand All @@ -568,32 +585,38 @@ func (c *Manager) MemoryEventFD() (int, uint32, error) {
return 0, 0, errors.New("failed to create inotify fd")
}
wd, err := syscall.InotifyAddWatch(fd, fpath, unix.IN_MODIFY)
if wd < 0 {
if err != nil {
syscall.Close(fd)
return 0, 0, fmt.Errorf("failed to add inotify watch for %q", fpath)
return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", fpath, err)
}
// monitor to detect process exit/cgroup deletion
evpath := filepath.Join(c.path, "cgroup.events")
_, err = syscall.InotifyAddWatch(fd, evpath, unix.IN_MODIFY)
if err != nil {
syscall.Close(fd)
return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", evpath, err)
}

return fd, uint32(wd), nil
}

func (c *Manager) EventChan() (<-chan Event, <-chan error) {
ec := make(chan Event)
errCh := make(chan error)
errCh := make(chan error, 1)
go c.waitForEvents(ec, errCh)

return ec, nil
return ec, errCh
}

func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
fd, wd, err := c.MemoryEventFD()

defer syscall.InotifyRmWatch(fd, wd)
defer syscall.Close(fd)
defer close(errCh)

fd, _, err := c.MemoryEventFD()
if err != nil {
errCh <- err
return
}
defer syscall.Close(fd)

for {
buffer := make([]byte, syscall.SizeofInotifyEvent*10)
Expand Down Expand Up @@ -643,7 +666,14 @@ func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
}
ec <- e
} else {
errCh <- err
// When cgroup is deleted read may return -ENODEV instead of -ENOENT from open
_, statErr := os.Lstat(filepath.Join(c.path, "memory.events"))
if !os.IsNotExist(statErr) {
errCh <- err
}
return
}
if c.isCgroupEmpty() {
return
}
}
Expand Down

0 comments on commit beb8965

Please sign in to comment.