From b7dbc505b6a7fbc14ba559acab31baf7d37f8b62 Mon Sep 17 00:00:00 2001 From: Matthew Heon Date: Mon, 23 May 2022 09:34:37 -0400 Subject: [PATCH] Instead of erroring, clean up after dangling IDs in DB For various (mostly legacy) reasons, Podman presently maintains a unified namespace for pods and containers - i.e., we cannot have both a pod and a container named "test" at the same time. To implement this, we use a global database table of every pod and container ID (and another of every pod and container name). These entries should be added when containers/pods are added, and removed when containers/pods are removed, with the database's transactional integrity providing a guarantee that this is batched with the overall removal and that the DB should remain sane and consistent no matter what. As such, we treat a dangling ID as a hard error that stops the use of Podman. Unfortunately, we had someone run into this last Friday. I'm still not certain how exactly their DB got into this state, but without further clarification there, we can consider removing the error and making Podman instead clean up and remove any dangling IDs, which should restore Podman to a serviceable state. Log an error message if we do this, though, because people should know that the DB is in a bad state. [NO NEW TESTS NEEDED] it is deliberately impossible to produce a configuration that would test this without hex-editing the DB file. 
Signed-off-by: Matthew Heon --- libpod/boltdb_state.go | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/libpod/boltdb_state.go b/libpod/boltdb_state.go index 9745121c77..c3db6152a9 100644 --- a/libpod/boltdb_state.go +++ b/libpod/boltdb_state.go @@ -162,6 +162,11 @@ func (s *BoltState) Refresh() error { return err } + namesBucket, err := getNamesBucket(tx) + if err != nil { + return err + } + ctrsBucket, err := getCtrBucket(tx) if err != nil { return err @@ -192,6 +197,7 @@ func (s *BoltState) Refresh() error { // PID, mountpoint, and state for all of them // Then save the modified state // Also clear all network namespaces + toRemoveIDs := []string{} err = idBucket.ForEach(func(id, name []byte) error { ctrBkt := ctrsBucket.Bucket(id) if ctrBkt == nil { @@ -199,8 +205,16 @@ func (s *BoltState) Refresh() error { podBkt := podsBucket.Bucket(id) if podBkt == nil { // This is neither a pod nor a container - // Error out on the dangling ID - return errors.Wrapf(define.ErrInternal, "id %s is not a pod or a container", string(id)) + // Something is seriously wrong, but + // continue on and try to clean up the + // state and become consistent. + // Just note what needs to be removed + // for now - ForEach says you shouldn't + // remove things from the table during + // it. + logrus.Errorf("Database issue: dangling ID %s found (not a pod or container) - removing", string(id)) + toRemoveIDs = append(toRemoveIDs, string(id)) + return nil } // Get the state @@ -285,6 +299,24 @@ func (s *BoltState) Refresh() error { return err } + // Remove dangling IDs. + for _, id := range toRemoveIDs { + // Look up the ID to see if we also have a dangling name + // in the DB. 
+ name := idBucket.Get([]byte(id)) + if name != nil { + if testID := namesBucket.Get(name); testID != nil { + logrus.Infof("Found dangling name %s (ID %s) in database", string(name), id) + if err := namesBucket.Delete(name); err != nil { + return errors.Wrapf(err, "error removing dangling name %s (ID %s) from database", string(name), id) + } + } + } + if err := idBucket.Delete([]byte(id)); err != nil { + return errors.Wrapf(err, "error removing dangling ID %s from database", id) + } + } + // Now refresh volumes err = allVolsBucket.ForEach(func(id, name []byte) error { dbVol := volBucket.Bucket(id)