openshift · sinnykumari · Nov 24, 2020 · yuqi-zhang · Nov 24, 2020 · sinnykumari
diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
@@ -165,6 +165,14 @@ const (
 	onceFromRemoteConfig
 )
 
+type rebootAction int // rebootAction selects how performRebootAction activates a finalized config.
+
+const (
+	rebootActionReboot     rebootAction = iota // reboot the node; this is the zero value
+	rebootActionNone                           // skip the reboot entirely
+	rebootActionReloadCrio                     // reload crio's configuration instead of rebooting
+)
+
 var (
 	defaultRebootTimeout = 24 * time.Hour
 )
@@ -1015,7 +1023,10 @@ func (dn *Daemon) checkStateOnFirstRun() error {
 		if err := dn.drain(); err != nil {
 			return err
 		}
-		return dn.finalizeAndReboot(state.pendingConfig)
+		if err := dn.finalizeBeforeReboot(state.pendingConfig); err != nil {
+			return err
+		}
+		return dn.reboot(fmt.Sprintf("Node will reboot into config %v", state.pendingConfig.GetName()))
 	}
 
 	if err := dn.detectEarlySSHAccessesFromBoot(); err != nil {
@@ -1043,7 +1054,10 @@ func (dn *Daemon) checkStateOnFirstRun() error {
 			if err := os.RemoveAll(osImageContentDir); err != nil {
 				return err
 			}
-			return dn.finalizeAndReboot(state.currentConfig)
+			if err := dn.finalizeBeforeReboot(state.currentConfig); err != nil {
+				return err
+			}
+			return dn.reboot(fmt.Sprintf("Node will reboot into config %v", state.currentConfig.GetName()))
 		}
 		glog.Info("No bootstrap pivot required; unlinking bootstrap node annotations")
 
@@ -1103,33 +1117,44 @@ func (dn *Daemon) checkStateOnFirstRun() error {
 		return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig)
 	}
 
-	// We've validated our state.  In the case where we had a pendingConfig,
-	// make that now currentConfig.  We update the node annotation, delete the
-	// state file, etc.
-	//
-	// However, it may be the case that desiredConfig changed while we
-	// were coming up, so we next look at that before uncordoning the node (so
-	// we don't uncordon and then immediately re-cordon)
+	// We've validated state. Now, ensure that node is in desired state
+	var inDesiredConfig bool
+	if inDesiredConfig, err = dn.updateConfigAndState(state); err != nil {
+		return err
+	}
+	if inDesiredConfig {
+		return nil
+	}
+
+	if dn.recorder != nil {
+		dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "BootResync", fmt.Sprintf("Booting node %s, currentConfig %s, desiredConfig %s", dn.node.Name, state.currentConfig.GetName(), state.desiredConfig.GetName()))
+	}
+	// currentConfig != desiredConfig, and we're not booting up into the desiredConfig.
+	// Kick off an update.
+	return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig)
+}
+
+// updateConfigAndState updates the node to the desired state, labels the node as done and uncordons it. The returned bool reports whether currentConfig matches desiredConfig.
+func (dn *Daemon) updateConfigAndState(state *stateAndConfigs) (bool, error) {
+	// In the case where we had a pendingConfig, make that now currentConfig.
+	// We update the node annotation, delete the state file, etc.
 	if state.pendingConfig != nil {
 		if dn.recorder != nil {
 			dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "NodeDone", fmt.Sprintf("Setting node %s, currentConfig %s to Done", dn.node.Name, state.pendingConfig.GetName()))
 		}
 		if err := dn.nodeWriter.SetDone(dn.kubeClient.CoreV1().Nodes(), dn.nodeLister, dn.name, state.pendingConfig.GetName()); err != nil {
-			return errors.Wrap(err, "error setting node's state to Done")
+			return true, errors.Wrap(err, "error setting node's state to Done")
 		}
 		if out, err := dn.storePendingState(state.pendingConfig, 0); err != nil {
-			return errors.Wrapf(err, "failed to reset pending config: %s", string(out))
+			return true, errors.Wrapf(err, "failed to reset pending config: %s", string(out))
 		}
 
 		state.currentConfig = state.pendingConfig
 	}
 
-	if state.bootstrapping {
-		if err := dn.storeCurrentConfigOnDisk(state.currentConfig); err != nil {
-			return err
-		}
-	}
-
+	// In case of node reboot, it may be the case that desiredConfig changed while we
+	// were coming up, so we next look at that before uncordoning the node (so
+	// we don't uncordon and then immediately re-cordon)
 	inDesiredConfig := state.currentConfig.GetName() == state.desiredConfig.GetName()
 	if inDesiredConfig {
 		if state.pendingConfig != nil {
@@ -1138,7 +1163,7 @@ func (dn *Daemon) checkStateOnFirstRun() error {
 			glog.Infof("Completing pending config %s", state.pendingConfig.GetName())
 			if err := dn.completeUpdate(dn.node, state.pendingConfig.GetName()); err != nil {
 				MCDUpdateState.WithLabelValues("", err.Error()).SetToCurrentTime()
-				return err
+				return inDesiredConfig, err
 			}
 		}
 		// If we're degraded here, it means we got an error likely on startup and we retried.
@@ -1147,22 +1172,16 @@ func (dn *Daemon) checkStateOnFirstRun() error {
 			if err := dn.nodeWriter.SetDone(dn.kubeClient.CoreV1().Nodes(), dn.nodeLister, dn.name, state.currentConfig.GetName()); err != nil {
 				errLabelStr := fmt.Sprintf("error setting node's state to Done: %v", err)
 				MCDUpdateState.WithLabelValues("", errLabelStr).SetToCurrentTime()
-				return errors.Wrap(err, "error setting node's state to Done")
+				return inDesiredConfig, errors.Wrap(err, "error setting node's state to Done")
 			}
 		}
 
 		glog.Infof("In desired config %s", state.currentConfig.GetName())
 		MCDUpdateState.WithLabelValues(state.currentConfig.GetName(), "").SetToCurrentTime()
 
 		// All good!
-		return nil
-	}
-	if dn.recorder != nil {
-		dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "BootResync", fmt.Sprintf("Booting node %s, currentConfig %s, desiredConfig %s", dn.node.Name, state.currentConfig.GetName(), state.desiredConfig.GetName()))
 	}
-	// currentConfig != desiredConfig, and we're not booting up into the desiredConfig.
-	// Kick off an update.
-	return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig)
+	return inDesiredConfig, nil
 }
 
 // runOnceFromMachineConfig utilizes a parsed machineConfig and executes in onceFrom

diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go
@@ -93,9 +93,56 @@ func getNodeRef(node *corev1.Node) *corev1.ObjectReference {
 	}
 }
 
-// finalizeAndReboot is the last step in an update(), and it can also
-// be called as a special case for the "bootstrap pivot".
-func (dn *Daemon) finalizeAndReboot(newConfig *mcfgv1.MachineConfig) (retErr error) {
+func reloadCrioConfig() error {
+	_, hupErr := runGetOut("pkill", "-HUP", "crio") // SIGHUP the crio process(es) by name; command output is discarded
+	return hupErr
+}
+
+// performRebootAction applies config configName using the requested rebootAction.
+// rebootActionNone and rebootActionReloadCrio skip the reboot and then call updateConfigAndState
+// to finish the update (documented there as marking the node done and uncordoning it); any other
+// action reboots. If a non-reboot step fails, we reboot so that the node gets the correct configuration.
+func (dn *Daemon) performRebootAction(action rebootAction, configName string) error {
+	switch action {
+	case rebootActionNone:
+		dn.logSystem("Node has Desired Config %s, skipping reboot", configName)
+	case rebootActionReloadCrio:
+		if err := reloadCrioConfig(); err != nil {
+			dn.logSystem("Reloading crio configuration failed, rebooting: %v", err)
+			return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName))
+		}
+		dn.logSystem("crio config reloaded successfully! Desired config %s has been applied, skipping reboot", configName)
+	default:
+		// Any other action, including rebootActionReboot, reboots the node.
+		dn.logSystem("Rebooting node")
+		return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName))
+	}
+
+	// Reaching this point means no reboot was needed to apply the configuration.
+
+	// Get the current state of the node; on error, fall back to a reboot.
+	state, err := dn.getStateAndConfigs(configName)
+	if err != nil {
+		glog.Errorf("Error processing state and configs, rebooting: %v", err)
+		return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName))
+	}
+
+	inDesiredConfig, err := dn.updateConfigAndState(state)
+	if err != nil {
+		glog.Errorf("Setting node's state to Done failed, rebooting: %v", err)
+		return dn.reboot(fmt.Sprintf("Node will reboot into config %s", configName))
+	}
+	if inDesiredConfig {
+		return nil
+	}
+
+	// currentConfig != desiredConfig, kick off an update
+	return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig)
+}
+
+// finalizeBeforeReboot is the last step in an update(); after it succeeds, the caller takes the appropriate rebootAction.
+// It can also be called as a special case for the "bootstrap pivot".
+func (dn *Daemon) finalizeBeforeReboot(newConfig *mcfgv1.MachineConfig) (retErr error) {
 	if out, err := dn.storePendingState(newConfig, 1); err != nil {
 		return errors.Wrapf(err, "failed to log pending config: %s", string(out))
 	}
@@ -114,8 +161,7 @@ func (dn *Daemon) finalizeAndReboot(newConfig *mcfgv1.MachineConfig) (retErr err
 		dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "PendingConfig", fmt.Sprintf("Written pending config %s", newConfig.GetName()))
 	}
 
-	// reboot. this function shouldn't actually return.
-	return dn.reboot(fmt.Sprintf("Node will reboot into config %v", newConfig.GetName()))
+	return nil
 }
 
 func (dn *Daemon) drain() error {
@@ -515,7 +561,19 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err
 		glog.Info("Updated kernel tuning arguments")
 	}
 
-	return dn.finalizeAndReboot(newConfig)
+	if err := dn.finalizeBeforeReboot(newConfig); err != nil {
+		return err
+	}
+
+	// TODO: Need Jerry's work to determine exact reboot action
+	var action rebootAction
+	action = dn.getRebootAction()
+	return dn.performRebootAction(action, newConfig.GetName())
+}
+
+// getRebootAction returns the rebootAction that update() passes to performRebootAction. There is no decision logic yet, so it always returns rebootActionReboot.
+func (dn *Daemon) getRebootAction() rebootAction {
+	return rebootActionReboot
 }
 
 // removeRollback removes the rpm-ostree rollback deployment.  It