From dd04c5f1bcb19b733a64d2bf2b5384ec5bd1ca6e Mon Sep 17 00:00:00 2001
From: Travis Nielsen
Date: Thu, 14 Nov 2019 16:19:43 -0700
Subject: [PATCH] ceph: add anti-affinity for the example of osds on pvcs

OSDs on PVCs need pod anti-affinity or other node affinity in order to
spread the OSDs across nodes. The OSD anti-affinity is only fully
effective when the number of OSDs does not exceed the node count, but
at least it is a start.

Signed-off-by: Travis Nielsen
---
 Documentation/ceph-cluster-crd.md               |  6 ++-
 .../kubernetes/ceph/cluster-on-pvc.yaml         | 42 +++++++++++++------
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/Documentation/ceph-cluster-crd.md b/Documentation/ceph-cluster-crd.md
index fe76bff207bc..d898a4eb18cd 100755
--- a/Documentation/ceph-cluster-crd.md
+++ b/Documentation/ceph-cluster-crd.md
@@ -231,8 +231,10 @@ The following are the settings for Storage Class Device Sets which can be config
 
 * `name`: A name for the set.
 * `count`: The number of devices in the set.
-* `resources`: The CPU and RAM requests/limits for the devices.(Optional)
-* `placement`: The placement criteria for the devices. Default is no placement criteria.(Optional)
+* `resources`: The CPU and RAM requests/limits for the devices. (Optional)
+* `placement`: The placement criteria for the devices. (Optional) Default is no placement criteria. It is recommended to configure the placement such that the OSDs will be
+as evenly spread across nodes as possible. At a minimum, anti-affinity should be added so that at least one OSD will be placed on each available node.
+However, if there are more OSDs than nodes, this anti-affinity will not be effective. Another placement scheme to consider is to add labels to groups of nodes, create multiple storageClassDeviceSets, and add node affinity to each device set so that its OSDs are placed on the corresponding group of nodes.
 * `portable`: If `true`, the OSDs will be allowed to move between nodes during failover. This requires a storage class that supports portability (e.g. `aws-ebs`, but not the local storage provisioner). If `false`, the OSDs will be assigned to a node permanently. Rook will configure Ceph's CRUSH map to support the portability.
 * `volumeClaimTemplates`: A list of PVC templates to use for provisioning the underlying storage devices.
 * `resources.requests.storage`: The desired capacity for the underlying storage devices.
diff --git a/cluster/examples/kubernetes/ceph/cluster-on-pvc.yaml b/cluster/examples/kubernetes/ceph/cluster-on-pvc.yaml
index c9afb1a75132..3f480df3c6f2 100644
--- a/cluster/examples/kubernetes/ceph/cluster-on-pvc.yaml
+++ b/cluster/examples/kubernetes/ceph/cluster-on-pvc.yaml
@@ -43,7 +43,37 @@ spec:
     topologyAware: true
     storageClassDeviceSets:
     - name: set1
+      # The number of OSDs to create from this device set
       count: 3
+      # IMPORTANT: If the volumes specified by the storageClassName are not portable across nodes,
+      # this needs to be set to false. For example, if using the local storage provisioner,
+      # this should be false.
+      portable: true
+      # Since the OSDs could end up on any node, an effort needs to be made to spread the OSDs
+      # across nodes as much as possible. Unfortunately the pod anti-affinity breaks down
+      # as soon as you have more than one OSD per node. If you have more OSDs than nodes, K8s may
+      # choose to schedule many of them on the same node. What we need is Pod Topology
+      # Spread Constraints, which are alpha in K8s 1.16. This means that a feature gate must be
+      # enabled for that feature, and Rook also still needs to add support for it.
+      # Another approach for a small number of OSDs is to create a separate device set for each
+      # zone (or other set of nodes with a common label) so that the OSDs will end up on different
+      # nodes. This would require adding nodeAffinity to the placement here.
+      placement:
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                - key: app
+                  operator: In
+                  values:
+                  - rook-ceph-osd
+                - key: app
+                  operator: In
+                  values:
+                  - rook-ceph-osd-prepare
+              topologyKey: kubernetes.io/hostname
       resources:
       # limits:
       #   cpu: "500m"
@@ -51,18 +81,6 @@ spec:
       # requests:
       #   cpu: "500m"
       #   memory: "4Gi"
-      # placement:
-      #  podAntiAffinity:
-      #    preferredDuringSchedulingIgnoredDuringExecution:
-      #    - weight: 100
-      #      podAffinityTerm:
-      #        labelSelector:
-      #          matchExpressions:
-      #          - key: "rook.io/cluster"
-      #            operator: In
-      #            values:
-      #            - cluster1
-      #        topologyKey: "failure-domain.beta.kubernetes.io/zone"
       volumeClaimTemplates:
       - metadata:
           name: data
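
As a rough sketch of the per-zone scheme described in the comments above, each device set could pin its OSDs to a labeled group of nodes with nodeAffinity instead of relying on podAntiAffinity. The set names and zone values below are assumptions for illustration only; the zone label key is the one already used in the removed example, and the volumeClaimTemplates would be the same as in the example above:

    storageClassDeviceSets:
    # One device set per zone (or other labeled group of nodes) so the OSDs
    # end up on different sets of nodes. Set names and zone values are
    # hypothetical; volumeClaimTemplates are omitted for brevity.
    - name: set-zone-a
      count: 1
      portable: true
      placement:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: failure-domain.beta.kubernetes.io/zone
                operator: In
                values:
                - us-east-1a
    - name: set-zone-b
      count: 1
      portable: true
      placement:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: failure-domain.beta.kubernetes.io/zone
                operator: In
                values:
                - us-east-1b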
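
For reference, the Pod Topology Spread Constraints mentioned in the comments would look roughly like the following in a pod spec once Rook can pass them through. This is only an illustration of the eventual approach, not something the CephCluster CRD accepts today, and on K8s 1.16 it requires the alpha EvenPodsSpread feature gate to be enabled:

    # Illustration only: not yet supported by Rook; requires the alpha
    # EvenPodsSpread feature gate on K8s 1.16.
    topologySpreadConstraints:
    - maxSkew: 1
      topologyKey: kubernetes.io/hostname
      whenUnsatisfiable: ScheduleAnyway
      labelSelector:
        matchLabels:
          app: rook-ceph-osd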