Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean Systemd files on exit #792

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 40 additions & 18 deletions cmd/sriov-network-config-daemon/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ import (
"net"
"net/url"
"os"
"os/signal"
"strings"
"syscall"
"time"

"github.com/spf13/cobra"
Expand Down Expand Up @@ -134,7 +136,6 @@ func runStartCmd(cmd *cobra.Command, args []string) error {

// This channel is used to ensure all spawned goroutines exit when we exit.
stopCh := make(chan struct{})
defer close(stopCh)

// This channel is used to signal Run() something failed and to jump ship.
// It's purely a chan<- in the Daemon struct for goroutines to write to, and
Expand Down Expand Up @@ -286,6 +287,7 @@ func runStartCmd(cmd *cobra.Command, args []string) error {
err = kClient.Get(context.Background(), types.NamespacedName{Namespace: vars.Namespace, Name: consts.DefaultConfigName}, defaultConfig)
if err != nil {
log.Log.Error(err, "Failed to get default SriovOperatorConfig object")
close(stopCh)
return err
}
featureGates := featuregate.New()
Expand All @@ -294,25 +296,45 @@ func runStartCmd(cmd *cobra.Command, args []string) error {
log.Log.Info("Enabled featureGates", "featureGates", featureGates.String())

setupLog.V(0).Info("Starting SriovNetworkConfigDaemon")
err = daemon.New(
kClient,
snclient,
kubeclient,
hostHelpers,
platformHelper,
exitCh,
stopCh,
syncCh,
refreshCh,
eventRecorder,
featureGates,
startOpts.disabledPlugins,
).Run(stopCh, exitCh)
if err != nil {
setupLog.Error(err, "failed to run daemon")

// create a signal channel to catch interrupts and gracefully shutdown the daemon
sigc := make(chan os.Signal, 1)
signal.Notify(sigc, os.Interrupt)
signal.Notify(sigc, syscall.SIGTERM)

errChan := make(chan error)
defer close(errChan)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used the method used here to close channels. But I believe there is a race in general, on exit (once main goes away, or if os.Exit is called) the goroutines are killed off immediately. So the goroutines listening on this channel will never get the chance to run their intended actions.

go func() {
errChan <- daemon.New(
kClient,
snclient,
kubeclient,
hostHelpers,
platformHelper,
exitCh,
stopCh,
syncCh,
refreshCh,
eventRecorder,
featureGates,
startOpts.disabledPlugins,
).Run(stopCh, exitCh)
}()

select {
case err := <-errChan:
// daemon has exited, close the stop channel and return the error
close(stopCh)
return err
case <-sigc:
// signal received, close the stop channel and wait for the daemon to exit
close(stopCh)
if err := <-errChan; err != nil {
return err
}
}
setupLog.V(0).Info("Shutting down SriovNetworkConfigDaemon")
return err
return nil
}

// updateDialer instruments a restconfig with a dial. the returned function allows forcefully closing all active connections.
Expand Down
21 changes: 17 additions & 4 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ type Daemon struct {

refreshCh chan<- Message

mu *sync.Mutex
mu sync.Mutex

disableDrain bool

Expand Down Expand Up @@ -120,7 +120,6 @@ func New(
eventRecorder: er,
featureGate: featureGates,
disabledPlugins: disabledPlugins,
mu: &sync.Mutex{},
}
}

Expand Down Expand Up @@ -207,6 +206,20 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {
for {
select {
case <-stopCh:
// clean files from host if we are running in systemd mode and the node
// is not required to be rebooted
dn.mu.Lock()
rebootrequired := utils.ObjectHasAnnotation(dn.desiredNodeState,
consts.NodeStateDrainAnnotation, consts.RebootRequired)
dn.mu.Unlock()

if vars.UsingSystemdMode && !rebootrequired {
err := systemd.CleanSriovFilesFromHost(vars.ClusterType == consts.ClusterTypeOpenshift)
Copy link
Collaborator

@adrianchiris adrianchiris Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC when the daemon is stopped we delete systemd files.

if the node reboots (given pods get stop signal) wont it delete the files ? which in turn mean the node will not have SR-IOV configured on startup ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What you say is correct. We should have a way to identify whether a reboot has been requested. In that case the cleanUp has to be a best effort because we can have an ongoing reboot and also a spec change in the daemonSet.

if err != nil {
log.Log.Error(err, "failed to remove all the systemd sriov files")
return err
}
}
log.Log.V(0).Info("Run(): stop daemon")
return nil
case err, more := <-exitCh:
Expand Down Expand Up @@ -316,6 +329,8 @@ func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) {
}

func (dn *Daemon) nodeStateSyncHandler() error {
dn.mu.Lock()
defer dn.mu.Unlock()
var err error
// Get the latest NodeState
var sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusSucceeded, LastSyncError: ""}
Expand Down Expand Up @@ -679,8 +694,6 @@ func (dn *Daemon) handleDrain(reqReboot bool) (bool, error) {
}

func (dn *Daemon) restartDevicePluginPod() error {
dn.mu.Lock()
defer dn.mu.Unlock()
log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")

pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
Expand Down
Loading