Skip to content

Commit

Permalink
Add a backoff and retries to retrieving exited event
Browse files Browse the repository at this point in the history
There's a potential race around extremely short-running
containers and events with journald. An event may not be written
until a small but appreciable time after it is received, so we
can fail to retrieve the exited event if too little time passes
between writing the event and trying to read it back.

Work around this by just retrying, with a 0.25 second delay
between retries, up to 4 times.

[NO TESTS NEEDED] because I have no idea how to reproduce this
race in CI.

Fixes #11633

Signed-off-by: Matthew Heon <[email protected]>
  • Loading branch information
mheon committed Sep 22, 2021
1 parent 70da2e8 commit d45cbbd
Showing 1 changed file with 25 additions and 30 deletions.
55 changes: 25 additions & 30 deletions pkg/domain/infra/abi/containers.go
Original file line number Diff line number Diff line change
Expand Up @@ -830,21 +830,7 @@ func (ic *ContainerEngine) ContainerStart(ctx context.Context, namesOrIds []stri
}
return reports, errors.Wrapf(err, "unable to start container %s", ctr.ID())
}

if ecode, err := ctr.Wait(ctx); err != nil {
if errors.Cause(err) == define.ErrNoSuchCtr {
// Check events
event, err := ic.Libpod.GetLastContainerEvent(ctx, ctr.ID(), events.Exited)
if err != nil {
logrus.Errorf("Cannot get exit code: %v", err)
exitCode = define.ExecErrorCodeNotFound
} else {
exitCode = event.ContainerExitCode
}
}
} else {
exitCode = int(ecode)
}
exitCode = ic.GetContainerExitCode(ctx, ctr)
reports = append(reports, &entities.ContainerStartReport{
Id: ctr.ID(),
RawInput: rawInput,
Expand Down Expand Up @@ -985,21 +971,7 @@ func (ic *ContainerEngine) ContainerRun(ctx context.Context, opts entities.Conta
report.ExitCode = define.ExitCode(err)
return &report, err
}

if ecode, err := ctr.Wait(ctx); err != nil {
if errors.Cause(err) == define.ErrNoSuchCtr {
// Check events
event, err := ic.Libpod.GetLastContainerEvent(ctx, ctr.ID(), events.Exited)
if err != nil {
logrus.Errorf("Cannot get exit code: %v", err)
report.ExitCode = define.ExecErrorCodeNotFound
} else {
report.ExitCode = event.ContainerExitCode
}
}
} else {
report.ExitCode = int(ecode)
}
report.ExitCode = ic.GetContainerExitCode(ctx, ctr)
if opts.Rm && !ctr.ShouldRestart(ctx) {
if err := ic.Libpod.RemoveContainer(ctx, ctr, false, true); err != nil {
if errors.Cause(err) == define.ErrNoSuchCtr ||
Expand All @@ -1013,6 +985,29 @@ func (ic *ContainerEngine) ContainerRun(ctx context.Context, opts entities.Conta
return &report, nil
}

// GetContainerExitCode waits for ctr and returns its exit code.
//
// If ctr.Wait succeeds, its exit code is returned directly. If Wait fails
// with define.ErrNoSuchCtr, the exit code is recovered from the container's
// last "exited" event instead. The event may not be readable immediately
// after the container exits (e.g. the journald race described in the commit
// that introduced this helper), so the lookup is retried a few times with a
// short backoff between attempts.
//
// Any other Wait error, or exhausting the retries, is logged and reported as
// define.ExecErrorCodeNotFound. The function never returns an error itself.
func (ic *ContainerEngine) GetContainerExitCode(ctx context.Context, ctr *libpod.Container) int {
	exitCode, err := ctr.Wait(ctx)
	if err == nil {
		return int(exitCode)
	}
	if errors.Cause(err) != define.ErrNoSuchCtr {
		logrus.Errorf("Could not retrieve exit code: %v", err)
		return define.ExecErrorCodeNotFound
	}

	// Make up to 4 attempts to read the exited event, pausing 0.25s between
	// consecutive attempts.
	const (
		maxAttempts = 4
		backoff     = 250 * time.Millisecond
	)

retry:
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		event, eventErr := ic.Libpod.GetLastContainerEvent(ctx, ctr.ID(), events.Exited)
		if eventErr == nil {
			return int(event.ContainerExitCode)
		}
		err = eventErr

		// No point in sleeping after the final attempt; it would only delay
		// reporting the failure.
		if attempt == maxAttempts {
			break
		}

		// Back off before the next attempt, but give up early if the caller's
		// context is cancelled while we wait.
		select {
		case <-ctx.Done():
			break retry
		case <-time.After(backoff):
		}
	}

	logrus.Errorf("Could not retrieve exit code from event: %v", err)
	return define.ExecErrorCodeNotFound
}

func (ic *ContainerEngine) ContainerLogs(ctx context.Context, containers []string, options entities.ContainerLogsOptions) error {
if options.StdoutWriter == nil && options.StderrWriter == nil {
return errors.New("no io.Writer set for container logs")
Expand Down

0 comments on commit d45cbbd

Please sign in to comment.