From 9e58a598e83e218ec671258fdfbeeb52e1a98742 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Thu, 27 Jun 2024 14:48:49 +0200 Subject: [PATCH] criu swrk: return child error to caller In the podman CI we are seeing a weird flake during criu version detection[1]. The write to the socket just fails with broken pipe. The logical thing to assume here is that the child exited. However the current code never reports back the child error from wait. The cleanup error is now added to the returned error so the caller sees both. The output is not captured as this causes hangs when the fds are passed into child processes. As errors.Join is used from the std lib bump the minimum go version to 1.20. [1] https://github.com/containers/podman/issues/18856 Signed-off-by: Paul Holzinger --- main.go | 24 +++++++++++++++++------- phaul/client.go | 11 +++++++++-- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/main.go b/main.go index 6906c9811..8f29d2ee2 100644 --- a/main.go +++ b/main.go @@ -61,13 +61,19 @@ func (c *Criu) Prepare() error { } // Cleanup cleans up -func (c *Criu) Cleanup() { +func (c *Criu) Cleanup() error { + var errs []error if c.swrkCmd != nil { - c.swrkSk.Close() + if err := c.swrkSk.Close(); err != nil { + errs = append(errs, err) + } c.swrkSk = nil - _ = c.swrkCmd.Wait() + if err := c.swrkCmd.Wait(); err != nil { + errs = append(errs, fmt.Errorf("criu swrk failed: %w", err)) + } c.swrkCmd = nil } + return errors.Join(errs...) } func (c *Criu) sendAndRecv(reqB []byte) ([]byte, int, error) { @@ -99,9 +105,7 @@ func (c *Criu) doSwrk(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy Notify) e return nil } -func (c *Criu) doSwrkWithResp(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy Notify, features *rpc.CriuFeatures) (*rpc.CriuResp, error) { - var resp *rpc.CriuResp - +func (c *Criu) doSwrkWithResp(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy Notify, features *rpc.CriuFeatures) (resp *rpc.CriuResp, retErr error) { req := rpc.CriuReq{ Type: &reqType, Opts: opts, @@ -121,7 +125,13 @@ func (c *Criu) doSwrkWithResp(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy N return nil, err } - defer c.Cleanup() + defer func() { + // append any cleanup errors to the returned error + err := c.Cleanup() + if err != nil { + retErr = errors.Join(retErr, err) + } + }() } for { diff --git a/phaul/client.go b/phaul/client.go index 8925ad1c8..e84ef6f1b 100644 --- a/phaul/client.go +++ b/phaul/client.go @@ -1,6 +1,7 @@ package phaul import ( + "errors" "fmt" "github.com/checkpoint-restore/go-criu/v7" @@ -55,7 +56,7 @@ func isLastIter(iter int, stats *stats.DumpStatsEntry, prevStats *stats.DumpStat } // Migrate function -func (pc *Client) Migrate() error { +func (pc *Client) Migrate() (retErr error) { criu := criu.MakeCriu() psi := rpc.CriuPageServerInfo{ Fd: proto.Int32(int32(pc.cfg.Memfd)), @@ -72,7 +73,13 @@ func (pc *Client) Migrate() error { return err } - defer criu.Cleanup() + defer func() { + // append any cleanup errors to the returned error + err := criu.Cleanup() + if err != nil { + retErr = errors.Join(retErr, err) + } + }() imgs, err := preparePhaulImages(pc.cfg.Wdir) if err != nil {