Skip to content

Commit

Permalink
Merge pull request rook#4343 from jmolmo/issue_4342
Browse files Browse the repository at this point in the history
Ceph: Toleration for <NotReady> nodes set to 5 seconds
  • Loading branch information
travisn authored Dec 17, 2019
2 parents cfa34d2 + e8ccef8 commit dca9672
Show file tree
Hide file tree
Showing 15 changed files with 197 additions and 1 deletion.
2 changes: 2 additions & 0 deletions PendingReleaseNotes.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ To remove OSDs manually, see the new doc on [OSD Management](Documentation/ceph-
- When running on PVC, the OSD can be on a slow device class, Rook can adapt to that by tuning the OSD. This can be enabled by the CR setting `tuneSlowDeviceClass`
- RGWs:
- Ceph Object Gateway are automatically configured to not run on the same host if hostNetwork is activated
- New environment variable available in the Operator: `ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS` (5 seconds by default). It represents the time to wait until the node controller moves Rook pods to other nodes after detecting an unreachable node. Pods affected by this setting are: mgr, rbd, mds, rgw, nfs, PVC based mons and osds, and ceph toolbox. The value of this variable replaces the default value of 300 seconds added automatically by k8s as the Pod toleration for `node.kubernetes.io/unreachable`.
The total amount of time needed to reschedule Rook pods onto healthy nodes after a node becomes `not ready` is now the sum of `node-monitor-grace-period` (k8s kube-controller-manager flag, 40 seconds by default) and `ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS` (5 seconds by default).

### EdgeFS

Expand Down
4 changes: 4 additions & 0 deletions cluster/charts/rook-ceph/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,10 @@ spec:
- name: ROOK_MON_OUT_TIMEOUT
value: {{ .Values.mon.monOutTimeout }}
{{- end }}
{{- end }}
{{- if .Values.unreachableNodeTolerationSeconds }}
- name: ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS
value: {{ .Values.unreachableNodeTolerationSeconds | quote }}
{{- end }}
resources:
{{ toYaml .Values.resources | indent 10 }}
Expand Down
3 changes: 3 additions & 0 deletions cluster/charts/rook-ceph/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ nodeSelector:
# Tolerations for the rook-ceph-operator to allow it to run on nodes with particular taints
tolerations: []

# Delay to use in node.kubernetes.io/unreachable toleration
unreachableNodeTolerationSeconds: 5

# Whether rook watches its current namespace for CRDs or the entire cluster, defaults to false
currentNamespaceOnly: false

Expand Down
15 changes: 15 additions & 0 deletions cluster/examples/kubernetes/ceph/operator-openshift.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,21 @@ spec:
# value: "9080"
- name: ROOK_HOSTPATH_REQUIRES_PRIVILEGED
value: "true"

# Time to wait until the node controller will move Rook pods to other
# nodes after detecting an unreachable node.
# Pods affected by this setting are:
# mgr, rbd, mds, rgw, nfs, PVC based mons and osds, and ceph toolbox
# The value used in this variable replaces the default value of 300 secs
# added automatically by k8s as Toleration for
# <node.kubernetes.io/unreachable>
# The total amount of time needed to reschedule Rook pods onto healthy nodes
# after a node becomes <not ready> will be the sum of:
# --> node-monitor-grace-period: 40 seconds (k8s kube-controller-manager flag)
# --> ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS: 5 seconds
- name: ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS
value: "5"

# The name of the node to pass with the downward API
- name: NODE_NAME
valueFrom:
Expand Down
15 changes: 15 additions & 0 deletions cluster/examples/kubernetes/ceph/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,21 @@ spec:
# value: "9090"
#- name: CSI_RBD_LIVENESS_METRICS_PORT
# value: "9080"

# Time to wait until the node controller will move Rook pods to other
# nodes after detecting an unreachable node.
# Pods affected by this setting are:
# mgr, rbd, mds, rgw, nfs, PVC based mons and osds, and ceph toolbox
# The value used in this variable replaces the default value of 300 secs
# added automatically by k8s as Toleration for
# <node.kubernetes.io/unreachable>
# The total amount of time needed to reschedule Rook pods onto healthy nodes
# after a node becomes <not ready> will be the sum of:
# --> node-monitor-grace-period: 40 seconds (k8s kube-controller-manager flag)
# --> ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS: 5 seconds
- name: ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS
value: "5"

# The name of the node to pass with the downward API
- name: NODE_NAME
valueFrom:
Expand Down
5 changes: 5 additions & 0 deletions cluster/examples/kubernetes/ceph/toolbox.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,8 @@ spec:
path: mon-endpoints
- name: ceph-config
emptyDir: {}
tolerations:
- key: "node.kubernetes.io/unreachable"
operator: "Exists"
effect: "NoExecute"
tolerationSeconds: 5
4 changes: 4 additions & 0 deletions pkg/operator/ceph/cluster/mgr/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ const (
)

func (c *Cluster) makeDeployment(mgrConfig *mgrConfig) *apps.Deployment {
logger.Debugf("mgrConfig: %+v", mgrConfig)
podSpec := v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Name: mgrConfig.ResourceName,
Expand All @@ -62,6 +63,9 @@ func (c *Cluster) makeDeployment(mgrConfig *mgrConfig) *apps.Deployment {
},
}

// Replace default unreachable node toleration
k8sutil.AddUnreachableNodeToleration(&podSpec.Spec)

// if the fix is needed, then the following init containers are created
// which explicitly configure the server_addr Ceph configuration option to
// be equal to the pod's IP address. Note that when the fix is not needed,
Expand Down
4 changes: 4 additions & 0 deletions pkg/operator/ceph/cluster/mon/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ func (c *Cluster) makeMonPod(monConfig *monConfig) *v1.Pod {
if c.Network.IsHost() {
podSpec.DNSPolicy = v1.DNSClusterFirstWithHostNet
}
// Replace default unreachable node toleration
if c.spec.Mon.VolumeClaimTemplate != nil {
k8sutil.AddUnreachableNodeToleration(&podSpec)
}

pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Expand Down
4 changes: 4 additions & 0 deletions pkg/operator/ceph/cluster/osd/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,10 @@ func (c *Cluster) makeDeployment(osdProps osdProperties, osd OSDInfo, provisionC
if !osdProps.portable {
deployment.Spec.Template.Spec.NodeSelector = map[string]string{v1.LabelHostname: osdProps.crushHostname}
}
// Replace default unreachable node toleration if the osd pod is portable and based in PVC
if osdProps.pvc.ClaimName != "" && osdProps.portable {
k8sutil.AddUnreachableNodeToleration(&deployment.Spec.Template.Spec)
}

k8sutil.AddRookVersionLabelToDeployment(deployment)
c.annotations.ApplyToObjectMeta(&deployment.ObjectMeta)
Expand Down
3 changes: 3 additions & 0 deletions pkg/operator/ceph/cluster/rbd/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ func (m *Mirroring) makeDeployment(daemonConfig *daemonConfig) *apps.Deployment
PriorityClassName: m.priorityClassName,
},
}
// Replace default unreachable node toleration
k8sutil.AddUnreachableNodeToleration(&podSpec.Spec)

if m.Network.IsHost() {
podSpec.Spec.DNSPolicy = v1.DNSClusterFirstWithHostNet
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/operator/ceph/file/mds/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ func (c *Cluster) makeDeployment(mdsConfig *mdsConfig) *apps.Deployment {
PriorityClassName: c.fs.Spec.MetadataServer.PriorityClassName,
},
}
// Replace default unreachable node toleration
k8sutil.AddUnreachableNodeToleration(&podSpec.Spec)

if c.clusterSpec.Network.IsHost() {
podSpec.Spec.DNSPolicy = v1.DNSClusterFirstWithHostNet
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/operator/ceph/nfs/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ func (c *CephNFSController) makeDeployment(nfs cephv1.CephNFS, cfg daemonConfig)
HostNetwork: c.clusterSpec.Network.IsHost(),
PriorityClassName: nfs.Spec.Server.PriorityClassName,
}
// Replace default unreachable node toleration
k8sutil.AddUnreachableNodeToleration(&podSpec)

if c.clusterSpec.Network.IsHost() {
podSpec.DNSPolicy = v1.DNSClusterFirstWithHostNet
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/operator/ceph/object/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ func (c *clusterConfig) makeRGWPodSpec(rgwConfig *rgwConfig) v1.PodTemplateSpec
HostNetwork: c.clusterSpec.Network.IsHost(),
PriorityClassName: c.store.Spec.Gateway.PriorityClassName,
}
// Replace default unreachable node toleration
k8sutil.AddUnreachableNodeToleration(&podSpec)

if c.clusterSpec.Network.IsHost() {
podSpec.DNSPolicy = v1.DNSClusterFirstWithHostNet
}
Expand Down
34 changes: 34 additions & 0 deletions pkg/operator/k8sutil/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"io"
"os"
"path"
"strconv"
"strings"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -119,6 +120,39 @@ func GetSpecContainerImage(spec v1.PodSpec, name string, initContainer bool) (st
return image.Image, nil
}

// AddUnreachableNodeToleration sets the pod's toleration for the
// node.kubernetes.io/unreachable taint, replacing the default toleration of
// 300s that is used when the node controller detects a not ready node.
//
// The toleration period can be modified by users through the
// ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS variable of the Rook operator. When
// the variable is unset, empty, or not a base-10 integer, 5 seconds are used.
// The node controller waits 40 seconds by default before marking a node as
// unreachable, so after 40s + ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS the pod
// will be scheduled on another node.
//
// Only one toleration for unreachable nodes is kept: if podSpec already has a
// toleration with that key, the first such entry is overwritten in place;
// otherwise the new toleration is appended.
func AddUnreachableNodeToleration(podSpec *v1.PodSpec) {
	const (
		unreachableKey = "node.kubernetes.io/unreachable"
		envVar         = "ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS"
	)

	var tolerationSeconds int64 = 5
	if raw := os.Getenv(envVar); raw != "" {
		duration, err := strconv.ParseInt(raw, 10, 64)
		if err != nil {
			// Report the offending value and the parse error so that a
			// misconfigured operator can be diagnosed from its log.
			logger.Warningf("invalid value %q for %s, using default value for <node.kubernetes.io/unreachable> toleration: %v seconds. %v",
				raw, envVar, tolerationSeconds, err)
		} else {
			tolerationSeconds = duration
		}
	}

	urToleration := v1.Toleration{
		Key:               unreachableKey,
		Operator:          "Exists",
		Effect:            "NoExecute",
		TolerationSeconds: &tolerationSeconds,
	}

	// Overwrite an existing toleration for the same key instead of adding a
	// second one.
	for index := range podSpec.Tolerations {
		if podSpec.Tolerations[index].Key == unreachableKey {
			podSpec.Tolerations[index] = urToleration
			return
		}
	}
	podSpec.Tolerations = append(podSpec.Tolerations, urToleration)
}

// GetRunningPod reads the name and namespace of a pod from the
// environment, and returns the pod (if it exists).
func GetRunningPod(clientset kubernetes.Interface) (*v1.Pod, error) {
Expand Down
96 changes: 95 additions & 1 deletion pkg/operator/k8sutil/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@ limitations under the License.
package k8sutil

import (
"os"
"testing"

"github.com/stretchr/testify/assert"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand Down Expand Up @@ -80,3 +81,96 @@ func TestGetPodPhaseMap(t *testing.T) {
// list of failed pods should have 1 entry
assert.Equal(t, 1, len(podPhaseMap[v1.PodFailed]))
}

// newToleration builds a toleration for tolerationKey with the "Exists"
// operator and the "NoExecute" effect, lasting defaultSeconds seconds.
func newToleration(defaultSeconds int64, tolerationKey string) v1.Toleration {
	seconds := defaultSeconds

	var toleration v1.Toleration
	toleration.Key = tolerationKey
	toleration.Operator = "Exists"
	toleration.Effect = "NoExecute"
	toleration.TolerationSeconds = &seconds
	return toleration
}

// TestAddUnreachableNodeToleration checks that AddUnreachableNodeToleration
// adds the unreachable-node toleration when it is missing, replaces it in
// place wherever it sits in the list, leaves other tolerations untouched, and
// falls back to 5 seconds when the environment variable is malformed.
func TestAddUnreachableNodeToleration(t *testing.T) {
	const (
		envVar         = "ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS"
		unreachableKey = "node.kubernetes.io/unreachable"
	)

	// The environment is process-wide: put it back the way it was so the
	// values set here do not leak into other tests of the package.
	if previous, wasSet := os.LookupEnv(envVar); wasSet {
		defer os.Setenv(envVar, previous)
	} else {
		defer os.Unsetenv(envVar)
	}

	podSpec := v1.PodSpec{}

	// -------------------------------------------------------------------------
	// Test one toleration of 5 seconds
	expectedURToleration := newToleration(5, unreachableKey)

	// Change the UR toleration in the pod using env var and the tested function
	os.Setenv(envVar, "5")
	AddUnreachableNodeToleration(&podSpec)

	assert.Equal(t, 1, len(podSpec.Tolerations))
	assert.Equal(t, expectedURToleration, podSpec.Tolerations[0])

	//--------------------------------------------------------------------------
	// Test adding one additional toleration, replaces the previous one,
	// keeping only the last.
	expectedURToleration = newToleration(6, unreachableKey)

	// Change the UR toleration in the pod using env var and the tested function
	os.Setenv(envVar, "6")
	AddUnreachableNodeToleration(&podSpec)

	assert.Equal(t, 1, len(podSpec.Tolerations))
	assert.Equal(t, expectedURToleration, podSpec.Tolerations[0])

	//--------------------------------------------------------------------------
	// Changing the toleration at the beginning of the list
	urTol := newToleration(10, unreachableKey)
	otherTol := newToleration(20, "node.kubernetes.io/network-unavailable")

	podSpec.Tolerations = []v1.Toleration{urTol, otherTol}

	expectedURToleration = newToleration(7, unreachableKey)

	// Change the Unreachable node toleration
	os.Setenv(envVar, "7")
	AddUnreachableNodeToleration(&podSpec)

	assert.Equal(t, 2, len(podSpec.Tolerations))
	assert.Equal(t, expectedURToleration, podSpec.Tolerations[0])
	// The unrelated toleration must not be modified
	assert.Equal(t, otherTol, podSpec.Tolerations[1])

	//--------------------------------------------------------------------------
	// Changing the toleration at the middle of the list
	podSpec.Tolerations = []v1.Toleration{otherTol, urTol, otherTol}

	expectedURToleration = newToleration(8, unreachableKey)

	// Change the Unreachable node toleration
	os.Setenv(envVar, "8")
	AddUnreachableNodeToleration(&podSpec)

	assert.Equal(t, 3, len(podSpec.Tolerations))
	assert.Equal(t, expectedURToleration, podSpec.Tolerations[1])
	// The unrelated tolerations must not be modified
	assert.Equal(t, otherTol, podSpec.Tolerations[0])
	assert.Equal(t, otherTol, podSpec.Tolerations[2])

	//--------------------------------------------------------------------------
	// Changing the toleration at the end of the list
	podSpec.Tolerations = []v1.Toleration{otherTol, urTol}

	expectedURToleration = newToleration(9, unreachableKey)

	// Change the Unreachable node toleration
	os.Setenv(envVar, "9")
	AddUnreachableNodeToleration(&podSpec)

	assert.Equal(t, 2, len(podSpec.Tolerations))
	assert.Equal(t, expectedURToleration, podSpec.Tolerations[1])
	// The unrelated toleration must not be modified
	assert.Equal(t, otherTol, podSpec.Tolerations[0])

	//--------------------------------------------------------------------------
	// Environment var with wrong value format results in using default value
	podSpec.Tolerations = nil

	// The default value used for the Unreachable Node Toleration is 5 seconds
	expectedURToleration = newToleration(5, unreachableKey)

	// Change the Unreachable node toleration using wrong format
	os.Setenv(envVar, "9s")
	AddUnreachableNodeToleration(&podSpec)

	assert.Equal(t, 1, len(podSpec.Tolerations))
	assert.Equal(t, expectedURToleration, podSpec.Tolerations[0])
}

0 comments on commit dca9672

Please sign in to comment.