From 4245265750fdd4186ab9a3f2dddab5c102c1a42d Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 18:04:19 -0700 Subject: [PATCH 01/36] Update autoscaler image. --- ray-operator/controllers/common/pod.go | 12 ++++++------ ray-operator/controllers/common/pod_test.go | 7 +++---- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/ray-operator/controllers/common/pod.go b/ray-operator/controllers/common/pod.go index c39335a2afe..7b05e34af8f 100644 --- a/ray-operator/controllers/common/pod.go +++ b/ray-operator/controllers/common/pod.go @@ -54,7 +54,7 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a } // inject autoscaler pod into head pod - container := BuildAutoscalerContainer(redisPasswd) + container := BuildAutoscalerContainer() podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, container) // set custom service account which can be authorized to talk with apiserver podTemplate.Spec.ServiceAccountName = instance.Name @@ -159,11 +159,12 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN } // BuildAutoscalerContainer build a ray autoscaler container which can be appended to head pod. -func BuildAutoscalerContainer(redisPasswd string) v1.Container { +func BuildAutoscalerContainer() v1.Container { container := v1.Container{ Name: "autoscaler", // TODO: choose right version based on instance.spec.Version - Image: "kuberay/autoscaler:nightly", + // Current image reflects changes up to https://github.com/ray-project/ray/pull/24718 + Image: "rayproject/ray:448f52", ImagePullPolicy: v1.PullAlways, Env: []v1.EnvVar{ { @@ -187,13 +188,12 @@ func BuildAutoscalerContainer(redisPasswd string) v1.Container { "/home/ray/anaconda3/bin/python", }, Args: []string{ - "/home/ray/run_autoscaler_with_retries.py", + "ray", + "kuberay-autoscaler", "--cluster-name", "$(RAY_CLUSTER_NAME)", "--cluster-namespace", "$(RAY_CLUSTER_NAMESPACE)", - "--redis-password", - redisPasswd, }, // TODO: make resource requirement configurable. Resources: v1.ResourceRequirements{ diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index 60bb9301a85..775ef62a69b 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -109,7 +109,7 @@ var instance = rayiov1alpha1.RayCluster{ var autoscalerContainer = v1.Container{ Name: "autoscaler", - Image: "kuberay/autoscaler:nightly", + Image: "rayproject/ray:448f52", ImagePullPolicy: v1.PullAlways, Env: []v1.EnvVar{ { @@ -133,13 +133,12 @@ var autoscalerContainer = v1.Container{ "/home/ray/anaconda3/bin/python", }, Args: []string{ - "/home/ray/run_autoscaler_with_retries.py", + "ray", + "kuberay-autoscaler", "--cluster-name", "$(RAY_CLUSTER_NAME)", "--cluster-namespace", "$(RAY_CLUSTER_NAMESPACE)", - "--redis-password", - DefaultRedisPassword, }, Resources: v1.ResourceRequirements{ Limits: v1.ResourceList{ From 8feae85df26e1489c13755d6bd986a971ce82ba4 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 18:17:28 -0700 Subject: [PATCH 02/36] Trailing spaces. --- manifests/base/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index 6b850850b17..3b8d1c3b188 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -11,7 +11,7 @@ resources: images: - name: kuberay/apiserver newName: kuberay/apiserver - newTag: nightly + newTag: nightly - name: kuberay/operator newName: kuberay/operator newTag: nightly From ad3e46333dddb4799d4bea60141727b20c49aeff Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 19:13:33 -0700 Subject: [PATCH 03/36] Add overlays. --- manifests/overlays/autoscaling/kustomization.yaml | 9 +++++++++ .../prioritize_workers_to_delete_patch.json | 5 +++++ .../prioritize_workers_to_delete_patch.yaml | 12 ++++++++++++ 3 files changed, 26 insertions(+) create mode 100644 manifests/overlays/autoscaling/kustomization.yaml create mode 100644 manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.json create mode 100644 manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.yaml diff --git a/manifests/overlays/autoscaling/kustomization.yaml b/manifests/overlays/autoscaling/kustomization.yaml new file mode 100644 index 00000000000..3b8a915b3d5 --- /dev/null +++ b/manifests/overlays/autoscaling/kustomization.yaml @@ -0,0 +1,9 @@ +bases: +- ../../base +patches: +- path: prioritize_workers_to_delete_patch.json + target: + group: apps + version: v1 + kind: Deployment + name: kuberay-operator diff --git a/manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.json b/manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.json new file mode 100644 index 00000000000..eaf63b5cb18 --- /dev/null +++ b/manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.json @@ -0,0 +1,5 @@ +[{ + "op":"replace", + "path":"/spec/template/spec/containers/0/args", + "value": ["--prioritize-workers-to-delete"] +}] diff --git a/manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.yaml b/manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.yaml new file mode 100644 index 00000000000..1381f2d1ab6 --- /dev/null +++ b/manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.yaml @@ -0,0 +1,12 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kuberay-operator + namespace: system +spec: + template: + spec: + containers: + - args: --prioritize-workers-to-delete + + type: NodePort From 076fa541c5d727de93e8805ce88f5c5dbeebf97e Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 19:43:07 -0700 Subject: [PATCH 04/36] Add to docs. --- docs/guidance/autoscaler.md | 13 ++++++++----- manifests/overlays/autoscaling/kustomization.yaml | 2 ++ .../config/samples/ray-cluster.autoscaler.yaml | 6 +++--- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/docs/guidance/autoscaler.md b/docs/guidance/autoscaler.md index acd2fd25f21..8fc598c6f38 100644 --- a/docs/guidance/autoscaler.md +++ b/docs/guidance/autoscaler.md @@ -11,9 +11,13 @@ You can follow below steps for a quick deployment. git clone https://github.com/ray-project/kuberay.git cd kuberay kubectl apply -k manifests/cluster-scope-resources -kubectl apply -k manifests/base +kubectl apply -k manifests/overlays/autoscaling ``` +> Note: For compatibility with the Ray autoscaler, the KubeRay Operator's entrypoint +> must include the flag `--prioritize-workers-to-delete`. The kustomization overlay +> `manifests/overlays/autoscaling` provided in the last command above adds the necessary flag. + ### Deploy a cluster with autoscaling enabled ``` @@ -69,11 +73,10 @@ Demands: enableInTreeAutoscaling: true ``` -2. head and work images are `rayproject/ray:413fe0`. This image was built based on [commit](https://github.com/ray-project/ray/commit/413fe08f8744d50b439717564709bc0af2f778f1) from master branch. -The reason we need to use a nightly version is because autoscaler needs to connect to Ray cluster. Due to ray [version requirements](https://docs.ray.io/en/latest/cluster/ray-client.html#versioning-requirements). -We determine to use nightly version to make sure integration is working. +2. The autoscaler image is `rayproject/ray:448f52` which reflects the latest changes from [Ray PR #24718](https://github.com/ray-project/ray/pull/24718/files) in the master branch. -3. Autoscaler image is `kuberay/autoscaler:nightly` which is built from [commit](https://github.com/ray-project/ray/pull/22689/files). +3. Autoscaling functionality is supported only for Ray version at least as new as 1.11.0. The autoscaler image used +is compatible with all Ray versions >= 1.11.0. ### Test autoscaling diff --git a/manifests/overlays/autoscaling/kustomization.yaml b/manifests/overlays/autoscaling/kustomization.yaml index 3b8a915b3d5..e18c10de282 100644 --- a/manifests/overlays/autoscaling/kustomization.yaml +++ b/manifests/overlays/autoscaling/kustomization.yaml @@ -1,3 +1,5 @@ +# This overlay patches in KubeRay operator configuration +# necessary for Ray Autoscaler support. bases: - ../../base patches: diff --git a/ray-operator/config/samples/ray-cluster.autoscaler.yaml b/ray-operator/config/samples/ray-cluster.autoscaler.yaml index 599020bd5f9..393b5641da8 100644 --- a/ray-operator/config/samples/ray-cluster.autoscaler.yaml +++ b/ray-operator/config/samples/ray-cluster.autoscaler.yaml @@ -8,7 +8,7 @@ metadata: # An unique identifier for the head node and workers of this cluster. name: raycluster-autoscaler spec: - rayVersion: 'nightly' + rayVersion: '1.12.1' enableInTreeAutoscaling: true ######################headGroupSpecs################################# # head group template and specs, (perhaps 'group' is not needed in the name) @@ -39,7 +39,7 @@ spec: containers: # The Ray head pod - name: ray-head - image: rayproject/ray:413fe0 + image: rayproject/ray:1.12.1 imagePullPolicy: Always env: - name: CPU_REQUEST @@ -124,7 +124,7 @@ spec: command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] containers: - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: rayproject/ray:413fe0 + image: rayproject/ray:1.12.1 # environment variables to set in the container.Optional. # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ env: From ba25502e23a3466c97ffd3d6b5235214562c1103 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 20:07:26 -0700 Subject: [PATCH 05/36] Remove redis in a couple of spots. --- ray-operator/controllers/common/pod.go | 8 -------- ray-operator/controllers/common/pod_test.go | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/ray-operator/controllers/common/pod.go b/ray-operator/controllers/common/pod.go index 7b05e34af8f..bf44e49de10 100644 --- a/ray-operator/controllers/common/pod.go +++ b/ray-operator/controllers/common/pod.go @@ -45,14 +45,6 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a // set custom service account with proper roles bound. podTemplate.Spec.ServiceAccountName = instance.Name - // Note: Starting with the upcoming Ray 1.11.0, Ray will by default no longer use Redis - // should be possible to drop some of the logic around Redis passwords at that point. - // TODO(jiaxin.shan): Add version compatibility for 1.11.0 later. - redisPasswd := instance.Spec.HeadGroupSpec.RayStartParams["redis-password"] - if len(redisPasswd) == 0 { - redisPasswd = DefaultRedisPassword - } - // inject autoscaler pod into head pod container := BuildAutoscalerContainer() podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, container) diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index 775ef62a69b..123dc5e5e94 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -259,7 +259,7 @@ func TestDefaultHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { } func TestBuildAutoscalerContainer(t *testing.T) { - actualContainer := BuildAutoscalerContainer(DefaultRedisPassword) + actualContainer := BuildAutoscalerContainer() expectedContainer := autoscalerContainer if !reflect.DeepEqual(expectedContainer, actualContainer) { t.Fatalf("Expected `%v` but got `%v`", expectedContainer, actualContainer) From 420f4d6826fc6cb71c0a3ff45d7b60570f0319cd Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 20:12:14 -0700 Subject: [PATCH 06/36] Namespace selector came out of somewhere... --- .../config/crd/bases/ray.io_rayclusters.yaml | 1184 ++++++++++++++--- 1 file changed, 993 insertions(+), 191 deletions(-) diff --git a/ray-operator/config/crd/bases/ray.io_rayclusters.yaml b/ray-operator/config/crd/bases/ray.io_rayclusters.yaml index 478a7933f09..7b9e713991c 100644 --- a/ray-operator/config/crd/bases/ray.io_rayclusters.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayclusters.yaml @@ -321,10 +321,53 @@ spec: of {key,value} pairs. type: object type: object + namespaceSelector: + description: A label query over the + set of namespaces that the term applies + to. + properties: + matchExpressions: + description: matchExpressions is + a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector + requirement is a selector that + contains values, a key, and + an operator that relates + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to + a set of values. + type: string + values: + description: values is an + array of string values. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map + of {key,value} pairs. + type: object + type: object namespaces: - description: 'namespaces specifies which - namespaces the labelSelector applies - to (matches against); null or empty ' + description: namespaces specifies a + static list of namespace names that + the term applies to. items: type: string type: array @@ -397,10 +440,51 @@ spec: {key,value} pairs. type: object type: object + namespaceSelector: + description: A label query over the set + of namespaces that the term applies to. + properties: + matchExpressions: + description: matchExpressions is a list + of label selector requirements. The + requirements are ANDed. + items: + description: A label selector requirement + is a selector that contains values, + a key, and an operator that relates + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to a set + of values. + type: string + values: + description: values is an array + of string values. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of + {key,value} pairs. + type: object + type: object namespaces: - description: 'namespaces specifies which - namespaces the labelSelector applies to - (matches against); null or empty ' + description: namespaces specifies a static + list of namespace names that the term + applies to. items: type: string type: array @@ -473,10 +557,53 @@ spec: of {key,value} pairs. type: object type: object + namespaceSelector: + description: A label query over the + set of namespaces that the term applies + to. + properties: + matchExpressions: + description: matchExpressions is + a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector + requirement is a selector that + contains values, a key, and + an operator that relates + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to + a set of values. + type: string + values: + description: values is an + array of string values. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map + of {key,value} pairs. + type: object + type: object namespaces: - description: 'namespaces specifies which - namespaces the labelSelector applies - to (matches against); null or empty ' + description: namespaces specifies a + static list of namespace names that + the term applies to. items: type: string type: array @@ -549,10 +676,51 @@ spec: {key,value} pairs. type: object type: object + namespaceSelector: + description: A label query over the set + of namespaces that the term applies to. + properties: + matchExpressions: + description: matchExpressions is a list + of label selector requirements. The + requirements are ANDed. + items: + description: A label selector requirement + is a selector that contains values, + a key, and an operator that relates + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to a set + of values. + type: string + values: + description: values is an array + of string values. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of + {key,value} pairs. + type: object + type: object namespaces: - description: 'namespaces specifies which - namespaces the labelSelector applies to - (matches against); null or empty ' + description: namespaces specifies a static + list of namespace names that the term + applies to. items: type: string type: array @@ -603,8 +771,8 @@ spec: type: string value: description: Variable references $(VAR_NAME) - are expanded using the previous defined - environment variables in the + are expanded using the previously defined + environment variables in t type: string valueFrom: description: Source for the environment variable's @@ -749,9 +917,8 @@ spec: after a container is created. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command @@ -811,8 +978,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name to @@ -836,9 +1002,8 @@ spec: or management e properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command @@ -898,8 +1063,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name to @@ -923,9 +1087,7 @@ spec: Container will be restarted if the probe fails. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the action - to take. + description: Exec specifies the action to take. properties: command: description: 'Command is the command line @@ -941,6 +1103,23 @@ spec: succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -1022,6 +1201,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. Minimum @@ -1077,9 +1262,7 @@ spec: readiness. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the action - to take. + description: Exec specifies the action to take. properties: command: description: 'Command is the command line @@ -1095,6 +1278,23 @@ spec: succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -1176,6 +1376,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. Minimum @@ -1209,8 +1415,8 @@ spec: type: object type: object securityContext: - description: 'Security options the pod should run - with. More info: https://kubernetes.' + description: SecurityContext defines the security + options the container should be run with. properties: allowPrivilegeEscalation: description: AllowPrivilegeEscalation controls @@ -1311,6 +1517,11 @@ spec: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string + hostProcess: + description: HostProcess determines if a + container should be run as a 'Host Process' + container. + type: boolean runAsUserName: description: The UserName in Windows to run the entrypoint of the container process. @@ -1322,9 +1533,7 @@ spec: has successfully initialized. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the action - to take. + description: Exec specifies the action to take. properties: command: description: 'Command is the command line @@ -1340,6 +1549,23 @@ spec: succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -1421,6 +1647,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. Minimum @@ -1560,8 +1792,8 @@ spec: description: List of ephemeral containers run in this pod. items: - description: An EphemeralContainer is a container that - may be added temporarily to an existing pod for user-initi + description: An EphemeralContainer is a temporary container + that you may add to an existing Pod for user-initiate properties: args: description: Arguments to the entrypoint. The docker @@ -1588,8 +1820,8 @@ spec: type: string value: description: Variable references $(VAR_NAME) - are expanded using the previous defined - environment variables in the + are expanded using the previously defined + environment variables in t type: string valueFrom: description: Source for the environment variable's @@ -1733,9 +1965,8 @@ spec: after a container is created. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command @@ -1795,8 +2026,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name to @@ -1820,9 +2050,8 @@ spec: or management e properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command @@ -1882,8 +2111,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name to @@ -1907,9 +2135,7 @@ spec: containers. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the action - to take. + description: Exec specifies the action to take. properties: command: description: 'Command is the command line @@ -1925,6 +2151,23 @@ spec: succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -2006,6 +2249,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. Minimum @@ -2053,14 +2302,16 @@ spec: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: Probes are not allowed for ephemeral containers. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the action - to take. + description: Exec specifies the action to take. properties: command: description: 'Command is the command line @@ -2076,6 +2327,23 @@ spec: succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -2157,6 +2425,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. Minimum @@ -2190,8 +2464,9 @@ spec: type: object type: object securityContext: - description: SecurityContext is not allowed for - ephemeral containers. + description: 'Optional: SecurityContext defines + the security options the ephemeral container should + be run with.' properties: allowPrivilegeEscalation: description: AllowPrivilegeEscalation controls @@ -2292,6 +2567,11 @@ spec: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string + hostProcess: + description: HostProcess determines if a + container should be run as a 'Host Process' + container. + type: boolean runAsUserName: description: The UserName in Windows to run the entrypoint of the container process. @@ -2303,9 +2583,7 @@ spec: containers. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the action - to take. + description: Exec specifies the action to take. properties: command: description: 'Command is the command line @@ -2321,6 +2599,23 @@ spec: succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -2402,6 +2697,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. Minimum @@ -2459,7 +2760,7 @@ spec: type: array volumeMounts: description: Pod volumes to mount into the container's - filesystem. Cannot be updated. + filesystem. items: description: VolumeMount describes a mounting of a Volume within a container. @@ -2583,8 +2884,8 @@ spec: type: string value: description: Variable references $(VAR_NAME) - are expanded using the previous defined - environment variables in the + are expanded using the previously defined + environment variables in t type: string valueFrom: description: Source for the environment variable's @@ -2729,9 +3030,8 @@ spec: after a container is created. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command @@ -2791,8 +3091,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name to @@ -2816,9 +3115,8 @@ spec: or management e properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command @@ -2878,8 +3176,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name to @@ -2903,9 +3200,7 @@ spec: Container will be restarted if the probe fails. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the action - to take. + description: Exec specifies the action to take. properties: command: description: 'Command is the command line @@ -2921,6 +3216,23 @@ spec: succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -3002,6 +3314,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. Minimum @@ -3057,9 +3375,7 @@ spec: readiness. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the action - to take. + description: Exec specifies the action to take. properties: command: description: 'Command is the command line @@ -3075,6 +3391,23 @@ spec: succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -3156,6 +3489,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. Minimum @@ -3189,8 +3528,8 @@ spec: type: object type: object securityContext: - description: 'Security options the pod should run - with. More info: https://kubernetes.' + description: SecurityContext defines the security + options the container should be run with. properties: allowPrivilegeEscalation: description: AllowPrivilegeEscalation controls @@ -3291,6 +3630,11 @@ spec: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string + hostProcess: + description: HostProcess determines if a + container should be run as a 'Host Process' + container. + type: boolean runAsUserName: description: The UserName in Windows to run the entrypoint of the container process. @@ -3302,9 +3646,7 @@ spec: has successfully initialized. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the action - to take. + description: Exec specifies the action to take. properties: command: description: 'Command is the command line @@ -3320,6 +3662,23 @@ spec: succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -3401,6 +3760,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. Minimum @@ -3509,6 +3874,18 @@ spec: description: NodeSelector is a selector which must be true for the pod to fit on a node. type: object + x-kubernetes-map-type: atomic + os: + description: Specifies the OS of the containers in the + pod. + properties: + name: + description: Name is the name of the operating system. + The currently supported values are linux and windows. + type: string + required: + - name + type: object overhead: additionalProperties: anyOf: @@ -3660,6 +4037,10 @@ spec: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string + hostProcess: + description: HostProcess determines if a container + should be run as a 'Host Process' container. + type: boolean runAsUserName: description: The UserName in Windows to run the entrypoint of the container process. @@ -4119,13 +4500,8 @@ spec: type: object ephemeral: description: Ephemeral represents a volume that - is handled by a cluster storage driver (Alpha - feature). + is handled by a cluster storage driver. properties: - readOnly: - description: Specifies a read-only configuration - for the volume. Defaults to false (read/write). - type: boolean volumeClaimTemplate: description: Will be used to create a stand-alone PVC to provision the volume. @@ -4183,10 +4559,31 @@ spec: - kind - name type: object + dataSourceRef: + description: Specifies the object from + which to populate the volume with + data, if a non-empty volume is desired. + properties: + apiGroup: + description: APIGroup is the group + for the resource being referenced. + type: string + kind: + description: Kind is the type of + resource being referenced + type: string + name: + description: Name is the name of + resource being referenced + type: string + required: + - kind + - name + type: object resources: - description: 'Resources represents the + description: Resources represents the minimum resources the volume should - have. More info: https://kubernetes.' + have. properties: limits: additionalProperties: @@ -4772,8 +5169,6 @@ spec: type: object type: object type: array - required: - - sources type: object quobyte: description: Quobyte represents a Quobyte mount @@ -5330,11 +5725,54 @@ spec: map of {key,value} pairs. type: object type: object + namespaceSelector: + description: A label query over the + set of namespaces that the term + applies to. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, a key, + and an operator that relates + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: operator represents + a key's relationship to + a set of values. + type: string + values: + description: values is an + array of string values. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a + map of {key,value} pairs. + type: object + type: object namespaces: - description: 'namespaces specifies - which namespaces the labelSelector - applies to (matches against); null - or empty ' + description: namespaces specifies + a static list of namespace names + that the term applies to. items: type: string type: array @@ -5407,10 +5845,52 @@ spec: of {key,value} pairs. type: object type: object + namespaceSelector: + description: A label query over the set + of namespaces that the term applies + to. + properties: + matchExpressions: + description: matchExpressions is a + list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement + is a selector that contains values, + a key, and an operator that relates + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to a + set of values. + type: string + values: + description: values is an array + of string values. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map + of {key,value} pairs. + type: object + type: object namespaces: - description: 'namespaces specifies which - namespaces the labelSelector applies - to (matches against); null or empty ' + description: namespaces specifies a static + list of namespace names that the term + applies to. items: type: string type: array @@ -5485,11 +5965,54 @@ spec: map of {key,value} pairs. type: object type: object + namespaceSelector: + description: A label query over the + set of namespaces that the term + applies to. + properties: + matchExpressions: + description: matchExpressions + is a list of label selector + requirements. The requirements + are ANDed. + items: + description: A label selector + requirement is a selector + that contains values, a key, + and an operator that relates + properties: + key: + description: key is the + label key that the selector + applies to. + type: string + operator: + description: operator represents + a key's relationship to + a set of values. + type: string + values: + description: values is an + array of string values. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a + map of {key,value} pairs. + type: object + type: object namespaces: - description: 'namespaces specifies - which namespaces the labelSelector - applies to (matches against); null - or empty ' + description: namespaces specifies + a static list of namespace names + that the term applies to. items: type: string type: array @@ -5562,10 +6085,52 @@ spec: of {key,value} pairs. type: object type: object + namespaceSelector: + description: A label query over the set + of namespaces that the term applies + to. + properties: + matchExpressions: + description: matchExpressions is a + list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement + is a selector that contains values, + a key, and an operator that relates + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to a + set of values. + type: string + values: + description: values is an array + of string values. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map + of {key,value} pairs. + type: object + type: object namespaces: - description: 'namespaces specifies which - namespaces the labelSelector applies - to (matches against); null or empty ' + description: namespaces specifies a static + list of namespace names that the term + applies to. items: type: string type: array @@ -5617,8 +6182,8 @@ spec: type: string value: description: Variable references $(VAR_NAME) - are expanded using the previous defined - environment variables in the + are expanded using the previously defined + environment variables in t type: string valueFrom: description: Source for the environment @@ -5765,9 +6330,8 @@ spec: after a container is created. properties: exec: - description: One and only one of the following - should be specified. Exec specifies - the action to take. + description: Exec specifies the action + to take. properties: command: description: 'Command is the command @@ -5828,8 +6392,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name @@ -5855,9 +6418,8 @@ spec: an API request or management e properties: exec: - description: One and only one of the following - should be specified. Exec specifies - the action to take. + description: Exec specifies the action + to take. properties: command: description: 'Command is the command @@ -5918,8 +6480,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name @@ -5945,9 +6506,8 @@ spec: Container will be restarted if the probe fails. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command line @@ -5963,6 +6523,23 @@ spec: having succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -6044,6 +6621,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully upon + probe failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. @@ -6101,9 +6684,8 @@ spec: readiness. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command line @@ -6119,6 +6701,23 @@ spec: having succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -6200,6 +6799,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully upon + probe failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. @@ -6234,8 +6839,8 @@ spec: type: object type: object securityContext: - description: 'Security options the pod should - run with. More info: https://kubernetes.' + description: SecurityContext defines the security + options the container should be run with. properties: allowPrivilegeEscalation: description: AllowPrivilegeEscalation controls @@ -6337,6 +6942,11 @@ spec: the name of the GMSA credential spec to use. type: string + hostProcess: + description: HostProcess determines if + a container should be run as a 'Host + Process' container. + type: boolean runAsUserName: description: The UserName in Windows to run the entrypoint of the container @@ -6349,9 +6959,8 @@ spec: has successfully initialized. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command line @@ -6367,6 +6976,23 @@ spec: having succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -6448,6 +7074,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully upon + probe failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. @@ -6589,9 +7221,9 @@ spec: description: List of ephemeral containers run in this pod. items: - description: An EphemeralContainer is a container - that may be added temporarily to an existing pod - for user-initi + description: An EphemeralContainer is a temporary + container that you may add to an existing Pod for + user-initiate properties: args: description: Arguments to the entrypoint. The @@ -6618,8 +7250,8 @@ spec: type: string value: description: Variable references $(VAR_NAME) - are expanded using the previous defined - environment variables in the + are expanded using the previously defined + environment variables in t type: string valueFrom: description: Source for the environment @@ -6765,9 +7397,8 @@ spec: after a container is created. properties: exec: - description: One and only one of the following - should be specified. Exec specifies - the action to take. + description: Exec specifies the action + to take. properties: command: description: 'Command is the command @@ -6828,8 +7459,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name @@ -6855,9 +7485,8 @@ spec: an API request or management e properties: exec: - description: One and only one of the following - should be specified. Exec specifies - the action to take. + description: Exec specifies the action + to take. properties: command: description: 'Command is the command @@ -6918,8 +7547,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name @@ -6945,9 +7573,8 @@ spec: containers. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command line @@ -6963,6 +7590,23 @@ spec: having succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -7044,6 +7688,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully upon + probe failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. @@ -7092,14 +7742,17 @@ spec: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: Probes are not allowed for ephemeral containers. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command line @@ -7115,6 +7768,23 @@ spec: having succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -7196,6 +7866,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully upon + probe failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. @@ -7230,8 +7906,9 @@ spec: type: object type: object securityContext: - description: SecurityContext is not allowed for - ephemeral containers. + description: 'Optional: SecurityContext defines + the security options the ephemeral container + should be run with.' properties: allowPrivilegeEscalation: description: AllowPrivilegeEscalation controls @@ -7333,6 +8010,11 @@ spec: the name of the GMSA credential spec to use. type: string + hostProcess: + description: HostProcess determines if + a container should be run as a 'Host + Process' container. + type: boolean runAsUserName: description: The UserName in Windows to run the entrypoint of the container @@ -7345,9 +8027,8 @@ spec: containers. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command line @@ -7363,6 +8044,23 @@ spec: having succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -7444,6 +8142,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully upon + probe failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. @@ -7501,7 +8205,7 @@ spec: type: array volumeMounts: description: Pod volumes to mount into the container's - filesystem. Cannot be updated. + filesystem. items: description: VolumeMount describes a mounting of a Volume within a container. @@ -7629,8 +8333,8 @@ spec: type: string value: description: Variable references $(VAR_NAME) - are expanded using the previous defined - environment variables in the + are expanded using the previously defined + environment variables in t type: string valueFrom: description: Source for the environment @@ -7777,9 +8481,8 @@ spec: after a container is created. properties: exec: - description: One and only one of the following - should be specified. Exec specifies - the action to take. + description: Exec specifies the action + to take. properties: command: description: 'Command is the command @@ -7840,8 +8543,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name @@ -7867,9 +8569,8 @@ spec: an API request or management e properties: exec: - description: One and only one of the following - should be specified. Exec specifies - the action to take. + description: Exec specifies the action + to take. properties: command: description: 'Command is the command @@ -7930,8 +8631,7 @@ spec: - port type: object tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. + description: Deprecated. properties: host: description: 'Optional: Host name @@ -7957,9 +8657,8 @@ spec: Container will be restarted if the probe fails. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command line @@ -7975,6 +8674,23 @@ spec: having succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -8056,6 +8772,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully upon + probe failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. @@ -8113,9 +8835,8 @@ spec: readiness. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command line @@ -8131,6 +8852,23 @@ spec: having succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -8212,6 +8950,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully upon + probe failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. @@ -8246,8 +8990,8 @@ spec: type: object type: object securityContext: - description: 'Security options the pod should - run with. More info: https://kubernetes.' + description: SecurityContext defines the security + options the container should be run with. properties: allowPrivilegeEscalation: description: AllowPrivilegeEscalation controls @@ -8349,6 +9093,11 @@ spec: the name of the GMSA credential spec to use. type: string + hostProcess: + description: HostProcess determines if + a container should be run as a 'Host + Process' container. + type: boolean runAsUserName: description: The UserName in Windows to run the entrypoint of the container @@ -8361,9 +9110,8 @@ spec: has successfully initialized. properties: exec: - description: One and only one of the following - should be specified. Exec specifies the - action to take. + description: Exec specifies the action to + take. properties: command: description: 'Command is the command line @@ -8379,6 +9127,23 @@ spec: having succeeded. format: int32 type: integer + grpc: + description: GRPC specifies an action involving + a GRPC port. + properties: + port: + description: Port number of the gRPC service. + Number must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: Service is the name of the + service to place in the gRPC HealthCheckRequest + (see https://github. + type: string + required: + - port + type: object httpGet: description: HTTPGet specifies the http request to perform. @@ -8460,6 +9225,12 @@ spec: required: - port type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds + the pod needs to terminate gracefully upon + probe failure. + format: int64 + type: integer timeoutSeconds: description: Number of seconds after which the probe times out. Defaults to 1 second. @@ -8569,6 +9340,18 @@ spec: description: NodeSelector is a selector which must be true for the pod to fit on a node. type: object + x-kubernetes-map-type: atomic + os: + description: Specifies the OS of the containers in the + pod. + properties: + name: + description: Name is the name of the operating system. + The currently supported values are linux and windows. + type: string + required: + - name + type: object overhead: additionalProperties: anyOf: @@ -8720,6 +9503,10 @@ spec: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string + hostProcess: + description: HostProcess determines if a container + should be run as a 'Host Process' container. + type: boolean runAsUserName: description: The UserName in Windows to run the entrypoint of the container process. @@ -9190,13 +9977,8 @@ spec: type: object ephemeral: description: Ephemeral represents a volume that - is handled by a cluster storage driver (Alpha - feature). + is handled by a cluster storage driver. properties: - readOnly: - description: Specifies a read-only configuration - for the volume. Defaults to false (read/write). - type: boolean volumeClaimTemplate: description: Will be used to create a stand-alone PVC to provision the volume. @@ -9255,10 +10037,32 @@ spec: - kind - name type: object + dataSourceRef: + description: Specifies the object + from which to populate the volume + with data, if a non-empty volume + is desired. + properties: + apiGroup: + description: APIGroup is the group + for the resource being referenced. + type: string + kind: + description: Kind is the type + of resource being referenced + type: string + name: + description: Name is the name + of resource being referenced + type: string + required: + - kind + - name + type: object resources: - description: 'Resources represents + description: Resources represents the minimum resources the volume - should have. More info: https://kubernetes.' + should have. properties: limits: additionalProperties: @@ -9857,8 +10661,6 @@ spec: type: object type: object type: array - required: - - sources type: object quobyte: description: Quobyte represents a Quobyte mount From 77e77dfd0cdafa79aea5985f3903f06659f31888 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 20:13:51 -0700 Subject: [PATCH 07/36] Remove scratch yaml. --- .../prioritize_workers_to_delete_patch.yaml | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.yaml diff --git a/manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.yaml b/manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.yaml deleted file mode 100644 index 1381f2d1ab6..00000000000 --- a/manifests/overlays/autoscaling/prioritize_workers_to_delete_patch.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kuberay-operator - namespace: system -spec: - template: - spec: - containers: - - args: --prioritize-workers-to-delete - - type: NodePort From e63c2ce738b12c4ca2463b1b693221aaa9d19261 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 20:18:12 -0700 Subject: [PATCH 08/36] Remove redis password logic from test. --- ray-operator/controllers/common/pod_test.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index 123dc5e5e94..7de6950744e 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -226,14 +226,6 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) { t.Fatalf("Expected `%v` in `%v` but doesn't have the config", expectedResult, pod.Spec.Containers[0].Args[0]) } - actualResult = cluster.Spec.HeadGroupSpec.RayStartParams["redis-password"] - targetContainer, err := utils.FilterContainerByName(pod.Spec.Containers, "autoscaler") - if err != nil { - t.Fatalf("error: %v", err) - } - if !utils.Contains(targetContainer.Args, actualResult) { - t.Fatalf("Expected redis password `%v` in `%v` but not found", targetContainer.Args, actualResult) - } } func TestDefaultHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { From 1643dee2cf92194574db78a5d031410f4798ed75 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 20:52:57 -0700 Subject: [PATCH 09/36] Add namespaces. --- manifests/base/kustomization.yaml | 1 - manifests/overlays/autoscaling/kustomization.yaml | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index 3b8d1c3b188..ecf1772cce5 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -1,6 +1,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization - namespace: ray-system resources: diff --git a/manifests/overlays/autoscaling/kustomization.yaml b/manifests/overlays/autoscaling/kustomization.yaml index e18c10de282..7e35a4ddc5e 100644 --- a/manifests/overlays/autoscaling/kustomization.yaml +++ b/manifests/overlays/autoscaling/kustomization.yaml @@ -1,5 +1,9 @@ # This overlay patches in KubeRay operator configuration # necessary for Ray Autoscaler support. +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: ray-system + bases: - ../../base patches: From abdaac5e11b281fe8a85ea1347ac79d9eda8a72b Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 21:10:37 -0700 Subject: [PATCH 10/36] Fix kustomization. --- manifests/base/kustomization.yaml | 1 + manifests/overlays/autoscaling/kustomization.yaml | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index ecf1772cce5..3b8d1c3b188 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -1,5 +1,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization + namespace: ray-system resources: diff --git a/manifests/overlays/autoscaling/kustomization.yaml b/manifests/overlays/autoscaling/kustomization.yaml index 7e35a4ddc5e..2d70257c001 100644 --- a/manifests/overlays/autoscaling/kustomization.yaml +++ b/manifests/overlays/autoscaling/kustomization.yaml @@ -2,7 +2,6 @@ # necessary for Ray Autoscaler support. apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -namespace: ray-system bases: - ../../base From dd587bfb97c8e3941f6b8cf8b4d7a44927ceedaf Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Thu, 19 May 2022 23:53:43 -0700 Subject: [PATCH 11/36] Log if the feature flag is enabled. --- ray-operator/main.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ray-operator/main.go b/ray-operator/main.go index 63d377fc5b0..0fbd773f767 100644 --- a/ray-operator/main.go +++ b/ray-operator/main.go @@ -70,6 +70,9 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) setupLog.Info("the operator", "version:", os.Getenv("OPERATOR_VERSION")) + if controllers.PrioritizeWorkersToDelete { + setupLog.Info("Feature flag prioritize-workers-to-delete is enabled.") + } mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme, From 1c7c6bd23f0b631a065eebaef9fcd0c24ff3c535 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Fri, 20 May 2022 00:29:38 -0700 Subject: [PATCH 12/36] Fix entrypoint. --- ray-operator/controllers/common/pod.go | 5 ++--- ray-operator/controllers/common/pod_test.go | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ray-operator/controllers/common/pod.go b/ray-operator/controllers/common/pod.go index bf44e49de10..4af87c40353 100644 --- a/ray-operator/controllers/common/pod.go +++ b/ray-operator/controllers/common/pod.go @@ -150,7 +150,7 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN return pod } -// BuildAutoscalerContainer build a ray autoscaler container which can be appended to head pod. +// BuildAutoscalerContainer builds a Ray autoscaler container which can be appended to the head pod. func BuildAutoscalerContainer() v1.Container { container := v1.Container{ Name: "autoscaler", @@ -177,10 +177,9 @@ func BuildAutoscalerContainer() v1.Container { }, }, Command: []string{ - "/home/ray/anaconda3/bin/python", + "ray", }, Args: []string{ - "ray", "kuberay-autoscaler", "--cluster-name", "$(RAY_CLUSTER_NAME)", diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index 7de6950744e..f459e0e265e 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -130,10 +130,9 @@ var autoscalerContainer = v1.Container{ }, }, Command: []string{ - "/home/ray/anaconda3/bin/python", + "ray", }, Args: []string{ - "ray", "kuberay-autoscaler", "--cluster-name", "$(RAY_CLUSTER_NAME)", From fe8619aea17b34e2c5cbce72d6986caccec17458 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Mon, 23 May 2022 17:44:06 -0700 Subject: [PATCH 13/36] Autoscaler logs volume mount. --- ray-operator/controllers/common/pod.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ray-operator/controllers/common/pod.go b/ray-operator/controllers/common/pod.go index 4af87c40353..bfa81b500d1 100644 --- a/ray-operator/controllers/common/pod.go +++ b/ray-operator/controllers/common/pod.go @@ -50,6 +50,7 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, container) // set custom service account which can be authorized to talk with apiserver podTemplate.Spec.ServiceAccountName = instance.Name + } // add metrics port for exposing to the promethues stack. @@ -197,6 +198,13 @@ func BuildAutoscalerContainer() v1.Container { v1.ResourceMemory: resource.MustParse("256Mi"), }, }, + // Needed to allow the Ray driver to pick up autoscaler events. + VolumeMounts: []v1.VolumeMount{ + { + MountPath: "/tmp/ray", + Name: "ray-logs", + }, + }, } return container } From ca422ae661f07ec493f5d78752bd9e147ba6edf2 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Mon, 23 May 2022 17:54:59 -0700 Subject: [PATCH 14/36] fix-test --- ray-operator/controllers/common/pod_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index f459e0e265e..b246aa3a48d 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -149,6 +149,12 @@ var autoscalerContainer = v1.Container{ v1.ResourceMemory: resource.MustParse("256Mi"), }, }, + VolumeMounts: []v1.VolumeMount{ + { + MountPath: "/tmp/ray", + Name: "ray-logs", + }, + }, } var trueFlag = true From cfea869a672833ff75cbfd54b32e87795ab30585 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Mon, 23 May 2022 19:28:41 -0700 Subject: [PATCH 15/36] Add Ray log volume mount when autoscaling. --- ray-operator/controllers/common/pod.go | 40 +++++++++++-------- ray-operator/controllers/common/pod_test.go | 6 ++- .../controllers/raycluster_controller.go | 2 +- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/ray-operator/controllers/common/pod.go b/ray-operator/controllers/common/pod.go index bfa81b500d1..8160370311c 100644 --- a/ray-operator/controllers/common/pod.go +++ b/ray-operator/controllers/common/pod.go @@ -20,6 +20,8 @@ import ( const ( SharedMemoryVolumeName = "shared-mem" SharedMemoryVolumeMountPath = "/dev/shm" + RayLogVolumeName = "ray-logs" + RayLogVolumeMountPath = "/tmp/ray" ) var log = logf.Log.WithName("RayCluster-Controller") @@ -50,7 +52,6 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, container) // set custom service account which can be authorized to talk with apiserver podTemplate.Spec.ServiceAccountName = instance.Name - } // add metrics port for exposing to the promethues stack. @@ -100,7 +101,7 @@ func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayi } // BuildPod a pod config -func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, svcName string) (aPod v1.Pod) { +func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayNodeType, rayStartParams map[string]string, svcName string, enableRayAutoscaler *bool) (aPod v1.Pod) { pod := v1.Pod{ TypeMeta: metav1.TypeMeta{ APIVersion: "v1", @@ -111,7 +112,12 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN } index := getRayContainerIndex(pod) - addEmptyDir(&pod.Spec.Containers[index], &pod) + //Add /dev/shm volumeMount for the object store to avoid performance degradation. + addEmptyDir(&pod.Spec.Containers[index], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath) + if rayNodeType == rayiov1alpha1.HeadNode && enableRayAutoscaler != nil && *enableRayAutoscaler { + //The Ray autoscaler communicates with the Ray head via a shared log directory. + addEmptyDir(&pod.Spec.Containers[index], &pod, RayLogVolumeName, RayLogVolumeMountPath) + } cleanupInvalidVolumeMounts(&pod.Spec.Containers[index], &pod) if len(pod.Spec.InitContainers) > index { cleanupInvalidVolumeMounts(&pod.Spec.InitContainers[index], &pod) @@ -201,8 +207,8 @@ func BuildAutoscalerContainer() v1.Container { // Needed to allow the Ray driver to pick up autoscaler events. VolumeMounts: []v1.VolumeMount{ { - MountPath: "/tmp/ray", - Name: "ray-logs", + MountPath: RayLogVolumeMountPath, + Name: RayLogVolumeName, }, }, } @@ -256,7 +262,7 @@ func labelPod(rayNodeType rayiov1alpha1.RayNodeType, rayClusterName string, grou for k, v := range ret { if k == string(rayNodeType) { - // overriding invalide values for this label + // overriding invalid values for this label if v != string(rayiov1alpha1.HeadNode) && v != string(rayiov1alpha1.WorkerNode) { labels[k] = v } @@ -410,13 +416,13 @@ func convertParamMap(rayStartParams map[string]string) (s string) { // addEmptyDir add an emptyDir to the shared memory mount point /dev/shm // this is to avoid: "The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance!..."" -func addEmptyDir(container *v1.Container, pod *v1.Pod) { - if checkIfVolumeMounted(container, pod) { +func addEmptyDir(container *v1.Container, pod *v1.Pod, volumeName string, volumeMountPath string) { + if checkIfVolumeMounted(container, pod, volumeMountPath) { return } // 1) create a Volume of type emptyDir and add it to Volumes emptyDirVolume := v1.Volume{ - Name: SharedMemoryVolumeName, + Name: volumeName, VolumeSource: v1.VolumeSource{ EmptyDir: &v1.EmptyDirVolumeSource{ Medium: v1.StorageMediumMemory, @@ -424,24 +430,24 @@ func addEmptyDir(container *v1.Container, pod *v1.Pod) { }, }, } - if !checkIfVolumeMounted(container, pod) { - pod.Spec.Volumes = append(pod.Spec.Volumes, emptyDirVolume) - } + pod.Spec.Volumes = append(pod.Spec.Volumes, emptyDirVolume) // 2) create a VolumeMount that uses the emptyDir mountedVolume := v1.VolumeMount{ - MountPath: SharedMemoryVolumeMountPath, - Name: SharedMemoryVolumeName, + MountPath: volumeMountPath, + Name: volumeName, ReadOnly: false, } - if !checkIfVolumeMounted(container, pod) { + if !checkIfVolumeMounted(container, pod, volumeMountPath) { container.VolumeMounts = append(container.VolumeMounts, mountedVolume) } } -func checkIfVolumeMounted(container *v1.Container, pod *v1.Pod) bool { +//Checks if the container has a volumeMount with the given mount path and if +//the pod has a matching Volume. +func checkIfVolumeMounted(container *v1.Container, pod *v1.Pod, volumeMountPath string) bool { for _, mountedVol := range container.VolumeMounts { - if mountedVol.MountPath == SharedMemoryVolumeMountPath { + if mountedVol.MountPath == volumeMountPath { for _, podVolume := range pod.Spec.Volumes { if mountedVol.Name == podVolume.Name { // already mounted, nothing to do diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index b246aa3a48d..f0924eca134 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -161,10 +161,12 @@ var trueFlag = true func TestBuildPod(t *testing.T) { cluster := instance.DeepCopy() + + // Test head pod podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) svcName := utils.GenerateServiceName(cluster.Name) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName) - pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, svcName) + pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, svcName, nil) actualResult := pod.Labels[RayClusterLabelKey] expectedResult := cluster.Name @@ -182,6 +184,8 @@ func TestBuildPod(t *testing.T) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } + // Test head pod with autoscaling enabled. + // testing worker pod worker := cluster.Spec.WorkerGroupSpecs[0] podName = cluster.Name + DashSymbol + string(rayiov1alpha1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) diff --git a/ray-operator/controllers/raycluster_controller.go b/ray-operator/controllers/raycluster_controller.go index 4d7ce60adea..44bfc8f5b66 100644 --- a/ray-operator/controllers/raycluster_controller.go +++ b/ray-operator/controllers/raycluster_controller.go @@ -495,7 +495,7 @@ func (r *RayClusterReconciler) buildHeadPod(instance rayiov1alpha1.RayCluster) c podName = utils.CheckName(podName) // making sure the name is valid svcName := utils.GenerateServiceName(instance.Name) podConf := common.DefaultHeadPodTemplate(instance, instance.Spec.HeadGroupSpec, podName, svcName) - pod := common.BuildPod(podConf, rayiov1alpha1.HeadNode, instance.Spec.HeadGroupSpec.RayStartParams, svcName) + pod := common.BuildPod(podConf, rayiov1alpha1.HeadNode, instance.Spec.HeadGroupSpec.RayStartParams, svcName, instance.Spec.EnableInTreeAutoscaling) // Set raycluster instance as the owner and controller if err := controllerutil.SetControllerReference(&instance, &pod, r.Scheme); err != nil { log.Error(err, "Failed to set controller reference for raycluster pod") From 16bafbde4d0bd45a7d548e6564e9be3fef287ee0 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Mon, 23 May 2022 19:30:24 -0700 Subject: [PATCH 16/36] Fix BuildPod. --- ray-operator/controllers/common/pod_test.go | 2 +- ray-operator/controllers/raycluster_controller.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index f0924eca134..a2593eea2dc 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -190,7 +190,7 @@ func TestBuildPod(t *testing.T) { worker := cluster.Spec.WorkerGroupSpecs[0] podName = cluster.Name + DashSymbol + string(rayiov1alpha1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, svcName) - pod = BuildPod(podTemplateSpec, rayiov1alpha1.WorkerNode, worker.RayStartParams, svcName) + pod = BuildPod(podTemplateSpec, rayiov1alpha1.WorkerNode, worker.RayStartParams, svcName, nil) expectedResult = fmt.Sprintf("%s:6379", svcName) actualResult = cluster.Spec.WorkerGroupSpecs[0].RayStartParams["address"] diff --git a/ray-operator/controllers/raycluster_controller.go b/ray-operator/controllers/raycluster_controller.go index 44bfc8f5b66..e4b71aac419 100644 --- a/ray-operator/controllers/raycluster_controller.go +++ b/ray-operator/controllers/raycluster_controller.go @@ -510,7 +510,7 @@ func (r *RayClusterReconciler) buildWorkerPod(instance rayiov1alpha1.RayCluster, podName = utils.CheckName(podName) // making sure the name is valid svcName := utils.GenerateServiceName(instance.Name) podTemplateSpec := common.DefaultWorkerPodTemplate(instance, worker, podName, svcName) - pod := common.BuildPod(podTemplateSpec, rayiov1alpha1.WorkerNode, worker.RayStartParams, svcName) + pod := common.BuildPod(podTemplateSpec, rayiov1alpha1.WorkerNode, worker.RayStartParams, svcName, instance.Spec.EnableInTreeAutoscaling) // Set raycluster instance as the owner and controller if err := controllerutil.SetControllerReference(&instance, &pod, r.Scheme); err != nil { log.Error(err, "Failed to set controller reference for raycluster pod") From 6658b8c43f51b13e33e65a8fcd1ac2b692d94752 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Mon, 23 May 2022 19:36:36 -0700 Subject: [PATCH 17/36] fix --- ray-operator/controllers/common/pod_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index a2593eea2dc..3993c861b56 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -184,8 +184,6 @@ func TestBuildPod(t *testing.T) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } - // Test head pod with autoscaling enabled. - // testing worker pod worker := cluster.Spec.WorkerGroupSpecs[0] podName = cluster.Name + DashSymbol + string(rayiov1alpha1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) @@ -211,7 +209,7 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) { podName := strings.ToLower(cluster.Name + DashSymbol + string(rayiov1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0)) svcName := utils.GenerateServiceName(cluster.Name) podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, svcName) - pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, svcName) + pod := BuildPod(podTemplateSpec, rayiov1alpha1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, svcName, &trueFlag) actualResult := pod.Labels[RayClusterLabelKey] expectedResult := cluster.Name From 88848ff299020798b07b10dfc970bffc7e7853ef Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Mon, 23 May 2022 21:24:01 -0700 Subject: [PATCH 18/36] Add an emptyDir volume functions. --- ray-operator/controllers/common/pod.go | 39 ++++++++++++++------- ray-operator/controllers/common/pod_test.go | 7 ++++ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/ray-operator/controllers/common/pod.go b/ray-operator/controllers/common/pod.go index 8160370311c..5591e5fe9d9 100644 --- a/ray-operator/controllers/common/pod.go +++ b/ray-operator/controllers/common/pod.go @@ -113,10 +113,10 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN index := getRayContainerIndex(pod) //Add /dev/shm volumeMount for the object store to avoid performance degradation. - addEmptyDir(&pod.Spec.Containers[index], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath) + addEmptyDir(&pod.Spec.Containers[index], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, v1.StorageMediumMemory) if rayNodeType == rayiov1alpha1.HeadNode && enableRayAutoscaler != nil && *enableRayAutoscaler { //The Ray autoscaler communicates with the Ray head via a shared log directory. - addEmptyDir(&pod.Spec.Containers[index], &pod, RayLogVolumeName, RayLogVolumeMountPath) + addEmptyDir(&pod.Spec.Containers[index], &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) } cleanupInvalidVolumeMounts(&pod.Spec.Containers[index], &pod) if len(pod.Spec.InitContainers) > index { @@ -416,20 +416,12 @@ func convertParamMap(rayStartParams map[string]string) (s string) { // addEmptyDir add an emptyDir to the shared memory mount point /dev/shm // this is to avoid: "The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance!..."" -func addEmptyDir(container *v1.Container, pod *v1.Pod, volumeName string, volumeMountPath string) { +func addEmptyDir(container *v1.Container, pod *v1.Pod, volumeName string, volumeMountPath string, storageMedium v1.StorageMedium) { if checkIfVolumeMounted(container, pod, volumeMountPath) { return } // 1) create a Volume of type emptyDir and add it to Volumes - emptyDirVolume := v1.Volume{ - Name: volumeName, - VolumeSource: v1.VolumeSource{ - EmptyDir: &v1.EmptyDirVolumeSource{ - Medium: v1.StorageMediumMemory, - SizeLimit: findMemoryReqOrLimit(*container), - }, - }, - } + emptyDirVolume := makeEmptyDirVolume(container, volumeName, storageMedium) pod.Spec.Volumes = append(pod.Spec.Volumes, emptyDirVolume) // 2) create a VolumeMount that uses the emptyDir @@ -443,6 +435,29 @@ func addEmptyDir(container *v1.Container, pod *v1.Pod, volumeName string, volume } } +//Format an emptyDir volume. +//When the storage medium is memory, set the size limit based on container resources. +//For others media , don't set a size limit. +func makeEmptyDirVolume(container *v1.Container, volumeName string, storageMedium v1.StorageMedium) v1.Volume { + var sizeLimit *resource.Quantity + if storageMedium == v1.StorageMediumMemory { + //If using memory, set size limit based on primary container's resources. + sizeLimit = findMemoryReqOrLimit(*container) + } else { + //Otherwise, don't set a limit. + sizeLimit = nil + } + return v1.Volume{ + Name: volumeName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + Medium: storageMedium, + SizeLimit: sizeLimit, + }, + }, + } +} + //Checks if the container has a volumeMount with the given mount path and if //the pod has a matching Volume. func checkIfVolumeMounted(container *v1.Container, pod *v1.Pod, volumeMountPath string) bool { diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index 3993c861b56..1b319ff8d40 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -107,6 +107,13 @@ var instance = rayiov1alpha1.RayCluster{ }, } +var volumesNoAutoscaler = []v1.Volume{ + { + Name: "shared-mem", + VolumeSource: v1. + } +} + var autoscalerContainer = v1.Container{ Name: "autoscaler", Image: "rayproject/ray:448f52", From 0dfa759f9746ef71c6f4f54d1cd8139288e8ad18 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Mon, 23 May 2022 22:22:26 -0700 Subject: [PATCH 19/36] Add resources to test instance. --- ray-operator/controllers/common/pod_test.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index 1b319ff8d40..19f5d20d937 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -47,7 +47,7 @@ var instance = rayiov1alpha1.RayCluster{ Containers: []v1.Container{ { Name: "ray-head", - Image: "rayproject/autoscaler", + Image: "rayproject/ray:1.0.0", Env: []v1.EnvVar{ { Name: "MY_POD_IP", @@ -58,6 +58,16 @@ var instance = rayiov1alpha1.RayCluster{ }, }, }, + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("1Gi"), + }, + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("1Gi"), + }, + }, }, }, }, @@ -110,8 +120,7 @@ var instance = rayiov1alpha1.RayCluster{ var volumesNoAutoscaler = []v1.Volume{ { Name: "shared-mem", - VolumeSource: v1. - } + }, } var autoscalerContainer = v1.Container{ From 91a04e1fb7792275d3a502216d0be942cdc2199b Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Mon, 23 May 2022 22:58:59 -0700 Subject: [PATCH 20/36] Unit test. --- ray-operator/controllers/common/pod_test.go | 77 ++++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/ray-operator/controllers/common/pod_test.go b/ray-operator/controllers/common/pod_test.go index 19f5d20d937..d90d8ebef98 100644 --- a/ray-operator/controllers/common/pod_test.go +++ b/ray-operator/controllers/common/pod_test.go @@ -17,6 +17,8 @@ import ( "k8s.io/utils/pointer" ) +var testMemoryLimit = resource.MustParse("1Gi") + var instance = rayiov1alpha1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ Name: "raycluster-sample", @@ -61,11 +63,11 @@ var instance = rayiov1alpha1.RayCluster{ Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{ v1.ResourceCPU: resource.MustParse("1"), - v1.ResourceMemory: resource.MustParse("1Gi"), + v1.ResourceMemory: testMemoryLimit, }, Limits: v1.ResourceList{ v1.ResourceCPU: resource.MustParse("1"), - v1.ResourceMemory: resource.MustParse("1Gi"), + v1.ResourceMemory: testMemoryLimit, }, }, }, @@ -120,6 +122,53 @@ var instance = rayiov1alpha1.RayCluster{ var volumesNoAutoscaler = []v1.Volume{ { Name: "shared-mem", + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + Medium: v1.StorageMediumMemory, + SizeLimit: &testMemoryLimit, + }, + }, + }, +} + +var volumesWithAutoscaler = []v1.Volume{ + { + Name: "shared-mem", + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + Medium: v1.StorageMediumMemory, + SizeLimit: &testMemoryLimit, + }, + }, + }, + { + Name: "ray-logs", + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + Medium: v1.StorageMediumDefault, + }, + }, + }, +} + +var volumeMountsNoAutoscaler = []v1.VolumeMount{ + { + Name: "shared-mem", + MountPath: "/dev/shm", + ReadOnly: false, + }, +} + +var volumeMountsWithAutoscaler = []v1.VolumeMount{ + { + Name: "shared-mem", + MountPath: "/dev/shm", + ReadOnly: false, + }, + { + Name: "ray-logs", + MountPath: "/tmp/ray", + ReadOnly: false, }, } @@ -200,6 +249,18 @@ func TestBuildPod(t *testing.T) { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } + actualVolumes := pod.Spec.Volumes + expectedVolumes := volumesNoAutoscaler + if !reflect.DeepEqual(actualVolumes, expectedVolumes) { + t.Fatalf("Expected `%v` but got `%v`", actualVolumes, expectedVolumes) + } + + actualVolumeMounts := pod.Spec.Containers[0].VolumeMounts + expectedVolumeMounts := volumeMountsNoAutoscaler + if !reflect.DeepEqual(actualVolumeMounts, expectedVolumeMounts) { + t.Fatalf("Expected `%v` but got `%v`", actualVolumes, expectedVolumes) + } + // testing worker pod worker := cluster.Spec.WorkerGroupSpecs[0] podName = cluster.Name + DashSymbol + string(rayiov1alpha1.WorkerNode) + DashSymbol + worker.GroupName + DashSymbol + utils.FormatInt32(0) @@ -249,6 +310,18 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) { t.Fatalf("Expected `%v` in `%v` but doesn't have the config", expectedResult, pod.Spec.Containers[0].Args[0]) } + actualVolumes := pod.Spec.Volumes + expectedVolumes := volumesWithAutoscaler + if !reflect.DeepEqual(actualVolumes, expectedVolumes) { + t.Fatalf("Expected `%v` but got `%v`", actualVolumes, expectedVolumes) + } + + actualVolumeMounts := pod.Spec.Containers[0].VolumeMounts + expectedVolumeMounts := volumeMountsWithAutoscaler + if !reflect.DeepEqual(actualVolumeMounts, expectedVolumeMounts) { + t.Fatalf("Expected `%v` but got `%v`", actualVolumes, expectedVolumes) + } + } func TestDefaultHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { From 5bacaef9476eef862e45a38902abf78db90d36a0 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Tue, 24 May 2022 09:39:49 -0700 Subject: [PATCH 21/36] Fix variable name. --- ray-operator/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/main.go b/ray-operator/main.go index aab5fde79fe..918a91f3c6f 100644 --- a/ray-operator/main.go +++ b/ray-operator/main.go @@ -70,7 +70,7 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) setupLog.Info("the operator", "version:", os.Getenv("OPERATOR_VERSION")) - if controllers.PrioritizeWorkersToDelete { + if raycluster.PrioritizeWorkersToDelete { setupLog.Info("Feature flag prioritize-workers-to-delete is enabled.") } From 588e30f026023241252fc5f77fe1db067912c8d0 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Tue, 24 May 2022 09:50:14 -0700 Subject: [PATCH 22/36] apply -> create --- README.md | 4 ++-- docs/deploy/installation.md | 4 ++-- docs/guidance/autoscaler.md | 2 +- docs/notebook/kuberay-on-kind.ipynb | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c1ab43f1878..61d5aa5dcf3 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,14 @@ You can view detailed documentation and guides at [https://ray-project.github.io #### Nightly version ``` -kubectl apply -k "github.com/ray-project/kuberay/manifests/cluster-scope-resources" +kubectl create -k "github.com/ray-project/kuberay/manifests/cluster-scope-resources" kubectl apply -k "github.com/ray-project/kuberay/manifests/base" ``` #### Stable version ``` -kubectl apply -k "github.com/ray-project/kuberay/manifests/cluster-scope-resources?ref=v0.2.0" +kubectl create -k "github.com/ray-project/kuberay/manifests/cluster-scope-resources?ref=v0.2.0" kubectl apply -k "github.com/ray-project/kuberay/manifests/base?ref=v0.2.0" ``` diff --git a/docs/deploy/installation.md b/docs/deploy/installation.md index 77173b67cb0..7ce6648e174 100644 --- a/docs/deploy/installation.md +++ b/docs/deploy/installation.md @@ -3,13 +3,13 @@ #### Nightly version ``` -kubectl apply -k "github.com/ray-project/kuberay/manifests/cluster-scope-resources" +kubectl create -k "github.com/ray-project/kuberay/manifests/cluster-scope-resources" kubectl apply -k "github.com/ray-project/kuberay/manifests/base" ``` #### Stable version ``` -kubectl apply -k "github.com/ray-project/kuberay/manifests/cluster-scope-resources?ref=v0.2.0" +kubectl create -k "github.com/ray-project/kuberay/manifests/cluster-scope-resources?ref=v0.2.0" kubectl apply -k "github.com/ray-project/kuberay/manifests/base?ref=v0.2.0" ``` diff --git a/docs/guidance/autoscaler.md b/docs/guidance/autoscaler.md index 8fc598c6f38..51969ba3730 100644 --- a/docs/guidance/autoscaler.md +++ b/docs/guidance/autoscaler.md @@ -10,7 +10,7 @@ You can follow below steps for a quick deployment. ``` git clone https://github.com/ray-project/kuberay.git cd kuberay -kubectl apply -k manifests/cluster-scope-resources +kubectl create -k manifests/cluster-scope-resources kubectl apply -k manifests/overlays/autoscaling ``` diff --git a/docs/notebook/kuberay-on-kind.ipynb b/docs/notebook/kuberay-on-kind.ipynb index 05f739ec32c..48472990f54 100644 --- a/docs/notebook/kuberay-on-kind.ipynb +++ b/docs/notebook/kuberay-on-kind.ipynb @@ -140,7 +140,7 @@ } ], "source": [ - "!kubectl apply -k \"github.com/ray-project/kuberay/manifests/cluster-scope-resources\"\n", + "!kubectl create -k \"github.com/ray-project/kuberay/manifests/cluster-scope-resources\"\n", "!kubectl apply -k \"github.com/ray-project/kuberay/manifests/base\"" ] }, From 189b1bd3c9864926aeef092f61e1a6998ee2874e Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Tue, 24 May 2022 11:11:55 -0700 Subject: [PATCH 23/36] Doc typos. --- docs/guidance/autoscaler.md | 2 +- ray-operator/controllers/raycluster/common/pod.go | 2 +- ray-operator/controllers/raycluster/common/pod_test.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/guidance/autoscaler.md b/docs/guidance/autoscaler.md index 51969ba3730..4f29d69e675 100644 --- a/docs/guidance/autoscaler.md +++ b/docs/guidance/autoscaler.md @@ -75,7 +75,7 @@ Demands: 2. The autoscaler image is `rayproject/ray:448f52` which reflects the latest changes from [Ray PR #24718](https://github.com/ray-project/ray/pull/24718/files) in the master branch. -3. Autoscaling functionality is supported only for Ray version at least as new as 1.11.0. The autoscaler image used +3. Autoscaling functionality is supported only with Ray versions at least as new as 1.11.0. The autoscaler image used is compatible with all Ray versions >= 1.11.0. ### Test autoscaling diff --git a/ray-operator/controllers/raycluster/common/pod.go b/ray-operator/controllers/raycluster/common/pod.go index de1d116f09d..b275f278450 100644 --- a/ray-operator/controllers/raycluster/common/pod.go +++ b/ray-operator/controllers/raycluster/common/pod.go @@ -162,7 +162,7 @@ func BuildAutoscalerContainer() v1.Container { container := v1.Container{ Name: "autoscaler", // TODO: choose right version based on instance.spec.Version - // Current image reflects changes up to https://github.com/ray-project/ray/pull/24718 + // The currently used image reflects changes up to https://github.com/ray-project/ray/pull/24718 Image: "rayproject/ray:448f52", ImagePullPolicy: v1.PullAlways, Env: []v1.EnvVar{ diff --git a/ray-operator/controllers/raycluster/common/pod_test.go b/ray-operator/controllers/raycluster/common/pod_test.go index e3c91cfeffa..79dd7c2bcff 100644 --- a/ray-operator/controllers/raycluster/common/pod_test.go +++ b/ray-operator/controllers/raycluster/common/pod_test.go @@ -25,7 +25,7 @@ var instance = rayiov1alpha1.RayCluster{ Namespace: "default", }, Spec: rayiov1alpha1.RayClusterSpec{ - RayVersion: "1.0.0", + RayVersion: "12.0.1", HeadGroupSpec: rayiov1alpha1.HeadGroupSpec{ ServiceType: "ClusterIP", Replicas: pointer.Int32Ptr(1), @@ -49,7 +49,7 @@ var instance = rayiov1alpha1.RayCluster{ Containers: []v1.Container{ { Name: "ray-head", - Image: "rayproject/ray:1.0.0", + Image: "rayproject/ray:12.0.1", Env: []v1.EnvVar{ { Name: "MY_POD_IP", From 40e15796769e929a753378cc7e669f34b9b718c6 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Wed, 25 May 2022 12:12:12 -0700 Subject: [PATCH 24/36] Update example config. --- ray-operator/config/samples/ray-cluster.autoscaler.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ray-operator/config/samples/ray-cluster.autoscaler.yaml b/ray-operator/config/samples/ray-cluster.autoscaler.yaml index 393b5641da8..061dfd39ea3 100644 --- a/ray-operator/config/samples/ray-cluster.autoscaler.yaml +++ b/ray-operator/config/samples/ray-cluster.autoscaler.yaml @@ -9,6 +9,7 @@ metadata: name: raycluster-autoscaler spec: rayVersion: '1.12.1' + # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 enableInTreeAutoscaling: true ######################headGroupSpecs################################# # head group template and specs, (perhaps 'group' is not needed in the name) @@ -20,7 +21,7 @@ spec: # logical group name, for this called head-group, also can be functional # pod type head or worker # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block --redis-port=6379 ... + # the following params are used to complete the ray start: ray start --head --block --port=6379 ... rayStartParams: # Flag "no-monitor" must be set when running the autoscaler in # a sidecar container. @@ -29,10 +30,10 @@ spec: node-ip-address: $MY_POD_IP # auto-completed as the head pod IP block: 'true' num-cpus: '1' # can be auto-completed from the limits - redis-password: 'LetMeInRay' # Deprecated since Ray 1.11 due to GCS bootstrapping enabled # Use `resources` to optionally specify custom resource annotations for the Ray node. # The value of `resources` is a string-integer mapping. - # Currently, `resources` must be provided in the unfortunate format demonstrated below. + # Currently, `resources` must be provided in the unfortunate format demonstrated below: + # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' #pod template template: spec: From a54ddd02be7c6a8890bd9482a2491db2566aa663 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Wed, 25 May 2022 19:25:43 -0700 Subject: [PATCH 25/36] Add a comment explaining what the log volume is for. --- ray-operator/controllers/raycluster/common/pod.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ray-operator/controllers/raycluster/common/pod.go b/ray-operator/controllers/raycluster/common/pod.go index b275f278450..6079f824b9a 100644 --- a/ray-operator/controllers/raycluster/common/pod.go +++ b/ray-operator/controllers/raycluster/common/pod.go @@ -116,6 +116,8 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN addEmptyDir(&pod.Spec.Containers[index], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, v1.StorageMediumMemory) if rayNodeType == rayiov1alpha1.HeadNode && enableRayAutoscaler != nil && *enableRayAutoscaler { //The Ray autoscaler communicates with the Ray head via a shared log directory. + //Specifically, we need a shared log volume to enable the event-logging functionality + //introduced in https://github.com/ray-project/ray/pull/13434. addEmptyDir(&pod.Spec.Containers[index], &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) } cleanupInvalidVolumeMounts(&pod.Spec.Containers[index], &pod) From 23f631cb0e018f5f06ca0ab57251a796ed428c65 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Wed, 25 May 2022 19:47:59 -0700 Subject: [PATCH 26/36] Document the volume. --- docs/guidance/autoscaler.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/guidance/autoscaler.md b/docs/guidance/autoscaler.md index 4f29d69e675..50e2dad294a 100644 --- a/docs/guidance/autoscaler.md +++ b/docs/guidance/autoscaler.md @@ -64,8 +64,10 @@ Demands: #### Known issues and limitations -1. operator will recognize following setting and automatically inject preconfigured autoscaler container to head pod. - The service account, role, role binding needed by autoscaler will be created by operator out-of-box. +1. The operator will recognize the following setting and automatically inject a preconfigured autoscaler container to the head pod. + The service account, role, and role binding needed by the autoscaler will be created by the operator out-of-box. + The operator will also configure an empty-dir logging volume for the Ray head pod. The volume will be mounted into the Ray and + autoscaler containers; this is necessary to support the event logging introduced in [Ray PR #13434](https://github.com/ray-project/ray/pull/13434). ``` spec: From 7cbdcff0f3846a38ac00ee486b85a9a870217fc8 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Fri, 27 May 2022 14:54:39 -0700 Subject: [PATCH 27/36] container -> pod --- ray-operator/controllers/raycluster/common/pod.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/raycluster/common/pod.go b/ray-operator/controllers/raycluster/common/pod.go index 6079f824b9a..225111ebf6f 100644 --- a/ray-operator/controllers/raycluster/common/pod.go +++ b/ray-operator/controllers/raycluster/common/pod.go @@ -47,7 +47,7 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a // set custom service account with proper roles bound. podTemplate.Spec.ServiceAccountName = utils.GetHeadGroupServiceAccountName(&instance) - // inject autoscaler pod into head pod + // inject autoscaler container into head pod container := BuildAutoscalerContainer() podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, container) // set custom service account which can be authorized to talk with apiserver From e970fabfc7ef68d3faf6caab59af7caa40947b3c Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Fri, 27 May 2022 22:03:58 -0700 Subject: [PATCH 28/36] Add volumes using the same method. --- .../controllers/raycluster/common/pod.go | 61 +++++++++++-------- .../controllers/raycluster/common/pod_test.go | 12 ++++ 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/ray-operator/controllers/raycluster/common/pod.go b/ray-operator/controllers/raycluster/common/pod.go index 225111ebf6f..f5ebe14517a 100644 --- a/ray-operator/controllers/raycluster/common/pod.go +++ b/ray-operator/controllers/raycluster/common/pod.go @@ -22,6 +22,7 @@ const ( SharedMemoryVolumeMountPath = "/dev/shm" RayLogVolumeName = "ray-logs" RayLogVolumeMountPath = "/tmp/ray" + AutoscalerContainerName = "autoscaler" ) var log = logf.Log.WithName("RayCluster-Controller") @@ -110,32 +111,34 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN ObjectMeta: podTemplateSpec.ObjectMeta, Spec: podTemplateSpec.Spec, } - index := getRayContainerIndex(pod) + rayContainerIndex, rayContainer := getRayContainer(pod) //Add /dev/shm volumeMount for the object store to avoid performance degradation. - addEmptyDir(&pod.Spec.Containers[index], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, v1.StorageMediumMemory) + addEmptyDir(&rayContainer, &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, v1.StorageMediumMemory) if rayNodeType == rayiov1alpha1.HeadNode && enableRayAutoscaler != nil && *enableRayAutoscaler { //The Ray autoscaler communicates with the Ray head via a shared log directory. //Specifically, we need a shared log volume to enable the event-logging functionality //introduced in https://github.com/ray-project/ray/pull/13434. - addEmptyDir(&pod.Spec.Containers[index], &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) + autoscalerContainer := getAutoscalerContainer(pod) + addEmptyDir(&rayContainer, &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) + addEmptyDir(&autoscalerContainer, &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) } - cleanupInvalidVolumeMounts(&pod.Spec.Containers[index], &pod) - if len(pod.Spec.InitContainers) > index { - cleanupInvalidVolumeMounts(&pod.Spec.InitContainers[index], &pod) + cleanupInvalidVolumeMounts(&rayContainer, &pod) + if len(pod.Spec.InitContainers) > rayContainerIndex { + cleanupInvalidVolumeMounts(&pod.Spec.InitContainers[rayContainerIndex], &pod) } var cmd, args string - if len(pod.Spec.Containers[index].Command) > 0 { - cmd = convertCmdToString(pod.Spec.Containers[index].Command) + if len(rayContainer.Command) > 0 { + cmd = convertCmdToString(rayContainer.Command) } - if len(pod.Spec.Containers[index].Args) > 0 { - cmd += convertCmdToString(pod.Spec.Containers[index].Args) + if len(rayContainer.Args) > 0 { + cmd += convertCmdToString(rayContainer.Args) } if !strings.Contains(cmd, "ray start") { - cont := concatenateContainerCommand(rayNodeType, rayStartParams, pod.Spec.Containers[index].Resources) + cont := concatenateContainerCommand(rayNodeType, rayStartParams, rayContainer.Resources) // replacing the old command - pod.Spec.Containers[index].Command = []string{"/bin/bash", "-c", "--"} + rayContainer.Command = []string{"/bin/bash", "-c", "--"} if cmd != "" { args = fmt.Sprintf("%s && %s", cont, cmd) } else { @@ -147,14 +150,14 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN args = args + " && sleep infinity" } - pod.Spec.Containers[index].Args = []string{args} + rayContainer.Args = []string{args} } for index := range pod.Spec.InitContainers { setInitContainerEnvVars(&pod.Spec.InitContainers[index], svcName) } - setContainerEnvVars(&pod.Spec.Containers[index], rayNodeType, rayStartParams, svcName) + setContainerEnvVars(&rayContainer, rayNodeType, rayStartParams, svcName) return pod } @@ -162,7 +165,7 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN // BuildAutoscalerContainer builds a Ray autoscaler container which can be appended to the head pod. func BuildAutoscalerContainer() v1.Container { container := v1.Container{ - Name: "autoscaler", + Name: AutoscalerContainerName, // TODO: choose right version based on instance.spec.Version // The currently used image reflects changes up to https://github.com/ray-project/ray/pull/24718 Image: "rayproject/ray:448f52", @@ -206,13 +209,6 @@ func BuildAutoscalerContainer() v1.Container { v1.ResourceMemory: resource.MustParse("256Mi"), }, }, - // Needed to allow the Ray driver to pick up autoscaler events. - VolumeMounts: []v1.VolumeMount{ - { - MountPath: RayLogVolumeMountPath, - Name: RayLogVolumeName, - }, - }, } return container } @@ -232,19 +228,32 @@ func convertCmdToString(cmdArr []string) (cmd string) { return cmdAggr.String() } -func getRayContainerIndex(pod v1.Pod) (index int) { - // theoretically, a ray pod can have multiple containers. +func getRayContainer(pod v1.Pod) (rayContainerIndex int, rayContainer v1.Container) { + // a ray pod can have multiple containers. // we identify the ray container based on env var: RAY=true // if the env var is missing, we choose containers[0]. for i, container := range pod.Spec.Containers { for _, env := range container.Env { if env.Name == strings.ToLower("ray") && env.Value == strings.ToLower("true") { - return i + return i, container } } } // not found, use first container - return 0 + return 0, pod.Spec.Containers[0] +} + +func getAutoscalerContainer(pod v1.Pod) (rayContainer v1.Container) { + // we identify the autoscaler container based on its name + for _, container := range pod.Spec.Containers { + if container.Name == AutoscalerContainerName { + return container + } + } + // The autoscaler container should have been found, so this branch shouldn't be accessed. + // The unit tests cover correct formatting of the autoscaling container, so + // we don't need to bubble up an error here. + return v1.Container{} } // labelPod returns the labels for selecting the resources diff --git a/ray-operator/controllers/raycluster/common/pod_test.go b/ray-operator/controllers/raycluster/common/pod_test.go index 79dd7c2bcff..bc23ffe7053 100644 --- a/ray-operator/controllers/raycluster/common/pod_test.go +++ b/ray-operator/controllers/raycluster/common/pod_test.go @@ -322,6 +322,18 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) { t.Fatalf("Expected `%v` but got `%v`", actualVolumes, expectedVolumes) } + // Make sure autoscaler container was formatted correctly. + numContainers := len(pod.Spec.Containers) + expectedNumContainers := 2 + if !(numContainers == expectedNumContainers) { + t.Fatalf("Expected `%v` container but got `%v`", actualVolumes, expectedVolumes) + } + actualContainer := getAutoscalerContainer(pod) + expectedContainer := autoscalerContainer + if !reflect.DeepEqual(expectedContainer, actualContainer) { + t.Fatalf("Expected `%v` but got `%v`", expectedContainer, actualContainer) + } + } func TestDefaultHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { From fe6da32abf589d61db98522fcd5710c6a27dff6c Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Fri, 27 May 2022 23:58:15 -0700 Subject: [PATCH 29/36] Reuse function to add volume. --- .../controllers/raycluster/common/pod.go | 87 +++++++++++-------- .../controllers/raycluster/common/pod_test.go | 19 ++-- 2 files changed, 58 insertions(+), 48 deletions(-) diff --git a/ray-operator/controllers/raycluster/common/pod.go b/ray-operator/controllers/raycluster/common/pod.go index f5ebe14517a..25fa90f62dd 100644 --- a/ray-operator/controllers/raycluster/common/pod.go +++ b/ray-operator/controllers/raycluster/common/pod.go @@ -111,34 +111,36 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN ObjectMeta: podTemplateSpec.ObjectMeta, Spec: podTemplateSpec.Spec, } - rayContainerIndex, rayContainer := getRayContainer(pod) + rayContainerIndex := getRayContainerIndex(pod) //Add /dev/shm volumeMount for the object store to avoid performance degradation. - addEmptyDir(&rayContainer, &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, v1.StorageMediumMemory) + addEmptyDir(&pod.Spec.Containers[rayContainerIndex], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, v1.StorageMediumMemory) if rayNodeType == rayiov1alpha1.HeadNode && enableRayAutoscaler != nil && *enableRayAutoscaler { - //The Ray autoscaler communicates with the Ray head via a shared log directory. - //Specifically, we need a shared log volume to enable the event-logging functionality + //The Ray autoscaler writes logs which are read by the Ray head. + //We need a shared log volume to enable this information flow. + //Specifically, this is required for the event-logging functionality //introduced in https://github.com/ray-project/ray/pull/13434. - autoscalerContainer := getAutoscalerContainer(pod) - addEmptyDir(&rayContainer, &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) - addEmptyDir(&autoscalerContainer, &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) + autoscalerContainerIndex := getAutoscalerContainerIndex(pod) + addEmptyDir(&pod.Spec.Containers[rayContainerIndex], &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) + addEmptyDir(&pod.Spec.Containers[autoscalerContainerIndex], &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) } - cleanupInvalidVolumeMounts(&rayContainer, &pod) + cleanupInvalidVolumeMounts(&pod.Spec.Containers[rayContainerIndex], &pod) + fmt.Printf("%v", pod.Spec.Volumes) if len(pod.Spec.InitContainers) > rayContainerIndex { cleanupInvalidVolumeMounts(&pod.Spec.InitContainers[rayContainerIndex], &pod) } var cmd, args string - if len(rayContainer.Command) > 0 { - cmd = convertCmdToString(rayContainer.Command) + if len(pod.Spec.Containers[rayContainerIndex].Command) > 0 { + cmd = convertCmdToString(pod.Spec.Containers[rayContainerIndex].Command) } - if len(rayContainer.Args) > 0 { - cmd += convertCmdToString(rayContainer.Args) + if len(pod.Spec.Containers[rayContainerIndex].Args) > 0 { + cmd += convertCmdToString(pod.Spec.Containers[rayContainerIndex].Args) } if !strings.Contains(cmd, "ray start") { - cont := concatenateContainerCommand(rayNodeType, rayStartParams, rayContainer.Resources) + cont := concatenateContainerCommand(rayNodeType, rayStartParams, pod.Spec.Containers[rayContainerIndex].Resources) // replacing the old command - rayContainer.Command = []string{"/bin/bash", "-c", "--"} + pod.Spec.Containers[rayContainerIndex].Command = []string{"/bin/bash", "-c", "--"} if cmd != "" { args = fmt.Sprintf("%s && %s", cont, cmd) } else { @@ -150,14 +152,14 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN args = args + " && sleep infinity" } - rayContainer.Args = []string{args} + pod.Spec.Containers[rayContainerIndex].Args = []string{args} } for index := range pod.Spec.InitContainers { setInitContainerEnvVars(&pod.Spec.InitContainers[index], svcName) } - setContainerEnvVars(&rayContainer, rayNodeType, rayStartParams, svcName) + setContainerEnvVars(&pod.Spec.Containers[rayContainerIndex], rayNodeType, rayStartParams, svcName) return pod } @@ -228,32 +230,33 @@ func convertCmdToString(cmdArr []string) (cmd string) { return cmdAggr.String() } -func getRayContainer(pod v1.Pod) (rayContainerIndex int, rayContainer v1.Container) { +func getRayContainerIndex(pod v1.Pod) (rayContainerIndex int) { // a ray pod can have multiple containers. // we identify the ray container based on env var: RAY=true // if the env var is missing, we choose containers[0]. for i, container := range pod.Spec.Containers { for _, env := range container.Env { if env.Name == strings.ToLower("ray") && env.Value == strings.ToLower("true") { - return i, container + return i } } } // not found, use first container - return 0, pod.Spec.Containers[0] + return 0 } -func getAutoscalerContainer(pod v1.Pod) (rayContainer v1.Container) { +func getAutoscalerContainerIndex(pod v1.Pod) (autoscalerContainerIndex int) { // we identify the autoscaler container based on its name - for _, container := range pod.Spec.Containers { + for i, container := range pod.Spec.Containers { if container.Name == AutoscalerContainerName { - return container + return i } } - // The autoscaler container should have been found, so this branch shouldn't be accessed. - // The unit tests cover correct formatting of the autoscaling container, so - // we don't need to bubble up an error here. - return v1.Container{} + + // not found, use second container + // (This code shouldn't be accessed.) + // (In any case, unit tests validate formatting of the autoscaler container.) + return 1 } // labelPod returns the labels for selecting the resources @@ -425,25 +428,29 @@ func convertParamMap(rayStartParams map[string]string) (s string) { return flags.String() } -// addEmptyDir add an emptyDir to the shared memory mount point /dev/shm -// this is to avoid: "The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This may slow down performance!..."" +// addEmptyDir adds an emptyDir volume to the pod and a corresponding volume mount to the container +// Used for a /dev/shm memory mount for object store and for a /tmp/ray disk mount for autoscaler logs. func addEmptyDir(container *v1.Container, pod *v1.Pod, volumeName string, volumeMountPath string, storageMedium v1.StorageMedium) { + fmt.Printf(">>>>>ENTERED FUNCTION!!!!!") if checkIfVolumeMounted(container, pod, volumeMountPath) { return } - // 1) create a Volume of type emptyDir and add it to Volumes - emptyDirVolume := makeEmptyDirVolume(container, volumeName, storageMedium) - pod.Spec.Volumes = append(pod.Spec.Volumes, emptyDirVolume) + fmt.Printf(">>>>>DOING STUFF!!!!!") + // 1) If needed, create a Volume of type emptyDir and add it to Volumes. + if !checkIfVolumeExists(pod, volumeName) { + fmt.Printf(">>>>>ADDING EMPTYDIR VOLUME!!!!!") + emptyDirVolume := makeEmptyDirVolume(container, volumeName, storageMedium) + pod.Spec.Volumes = append(pod.Spec.Volumes, emptyDirVolume) + fmt.Printf("%v", pod.Spec.Volumes) + } - // 2) create a VolumeMount that uses the emptyDir + // 2) Create a VolumeMount that uses the emptyDir. mountedVolume := v1.VolumeMount{ MountPath: volumeMountPath, Name: volumeName, ReadOnly: false, } - if !checkIfVolumeMounted(container, pod, volumeMountPath) { - container.VolumeMounts = append(container.VolumeMounts, mountedVolume) - } + container.VolumeMounts = append(container.VolumeMounts, mountedVolume) } //Format an emptyDir volume. @@ -485,6 +492,16 @@ func checkIfVolumeMounted(container *v1.Container, pod *v1.Pod, volumeMountPath return false } +//Checks if a volume with the given name exists. +func checkIfVolumeExists(pod *v1.Pod, volumeName string) bool { + for _, podVolume := range pod.Spec.Volumes { + if podVolume.Name == volumeName { + return true + } + } + return false +} + func cleanupInvalidVolumeMounts(container *v1.Container, pod *v1.Pod) { // if a volumeMount is specified in the container, // but has no corresponding pod volume, it is removed diff --git a/ray-operator/controllers/raycluster/common/pod_test.go b/ray-operator/controllers/raycluster/common/pod_test.go index bc23ffe7053..ccc4fce95cb 100644 --- a/ray-operator/controllers/raycluster/common/pod_test.go +++ b/ray-operator/controllers/raycluster/common/pod_test.go @@ -252,13 +252,13 @@ func TestBuildPod(t *testing.T) { actualVolumes := pod.Spec.Volumes expectedVolumes := volumesNoAutoscaler if !reflect.DeepEqual(actualVolumes, expectedVolumes) { - t.Fatalf("Expected `%v` but got `%v`", actualVolumes, expectedVolumes) + t.Fatalf("Expected `%v` but got `%v`", expectedVolumes, actualVolumes) } actualVolumeMounts := pod.Spec.Containers[0].VolumeMounts expectedVolumeMounts := volumeMountsNoAutoscaler if !reflect.DeepEqual(actualVolumeMounts, expectedVolumeMounts) { - t.Fatalf("Expected `%v` but got `%v`", actualVolumes, expectedVolumes) + t.Fatalf("Expected `%v` but got `%v`", expectedVolumeMounts, actualVolumeMounts) } // testing worker pod @@ -319,16 +319,17 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) { actualVolumeMounts := pod.Spec.Containers[0].VolumeMounts expectedVolumeMounts := volumeMountsWithAutoscaler if !reflect.DeepEqual(actualVolumeMounts, expectedVolumeMounts) { - t.Fatalf("Expected `%v` but got `%v`", actualVolumes, expectedVolumes) + t.Fatalf("Expected `%v` but got `%v`", expectedVolumeMounts, actualVolumeMounts) } // Make sure autoscaler container was formatted correctly. numContainers := len(pod.Spec.Containers) expectedNumContainers := 2 if !(numContainers == expectedNumContainers) { - t.Fatalf("Expected `%v` container but got `%v`", actualVolumes, expectedVolumes) + t.Fatalf("Expected `%v` container but got `%v`", expectedVolumes, actualVolumes) } - actualContainer := getAutoscalerContainer(pod) + index := getAutoscalerContainerIndex(pod) + actualContainer := pod.Spec.Containers[index] expectedContainer := autoscalerContainer if !reflect.DeepEqual(expectedContainer, actualContainer) { t.Fatalf("Expected `%v` but got `%v`", expectedContainer, actualContainer) @@ -358,14 +359,6 @@ func TestDefaultHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { } } -func TestBuildAutoscalerContainer(t *testing.T) { - actualContainer := BuildAutoscalerContainer() - expectedContainer := autoscalerContainer - if !reflect.DeepEqual(expectedContainer, actualContainer) { - t.Fatalf("Expected `%v` but got `%v`", expectedContainer, actualContainer) - } -} - func splitAndSort(s string) []string { strs := strings.Split(s, " ") result := make([]string, 0, len(strs)) From d0f98ceff848aaec0a2b5ac91fb7a30cdb2474ef Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Sat, 28 May 2022 00:08:50 -0700 Subject: [PATCH 30/36] Remove print statements --- ray-operator/controllers/ray/common/pod.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 3be604db43a..dcc54d474fd 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -125,7 +125,6 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN addEmptyDir(&pod.Spec.Containers[autoscalerContainerIndex], &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) } cleanupInvalidVolumeMounts(&pod.Spec.Containers[rayContainerIndex], &pod) - fmt.Printf("%v", pod.Spec.Volumes) if len(pod.Spec.InitContainers) > rayContainerIndex { cleanupInvalidVolumeMounts(&pod.Spec.InitContainers[rayContainerIndex], &pod) } @@ -431,17 +430,13 @@ func convertParamMap(rayStartParams map[string]string) (s string) { // addEmptyDir adds an emptyDir volume to the pod and a corresponding volume mount to the container // Used for a /dev/shm memory mount for object store and for a /tmp/ray disk mount for autoscaler logs. func addEmptyDir(container *v1.Container, pod *v1.Pod, volumeName string, volumeMountPath string, storageMedium v1.StorageMedium) { - fmt.Printf(">>>>>ENTERED FUNCTION!!!!!") if checkIfVolumeMounted(container, pod, volumeMountPath) { return } - fmt.Printf(">>>>>DOING STUFF!!!!!") // 1) If needed, create a Volume of type emptyDir and add it to Volumes. if !checkIfVolumeExists(pod, volumeName) { - fmt.Printf(">>>>>ADDING EMPTYDIR VOLUME!!!!!") emptyDirVolume := makeEmptyDirVolume(container, volumeName, storageMedium) pod.Spec.Volumes = append(pod.Spec.Volumes, emptyDirVolume) - fmt.Printf("%v", pod.Spec.Volumes) } // 2) Create a VolumeMount that uses the emptyDir. From 8f7d64e99a987a4d97dcf1c1c6aae40cbf94804b Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Sat, 28 May 2022 00:13:21 -0700 Subject: [PATCH 31/36] raycluster -> ray --- ray-operator/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/main.go b/ray-operator/main.go index 4edfa48c725..d73ff66d731 100644 --- a/ray-operator/main.go +++ b/ray-operator/main.go @@ -70,7 +70,7 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) setupLog.Info("the operator", "version:", os.Getenv("OPERATOR_VERSION")) - if raycluster.PrioritizeWorkersToDelete { + if ray.PrioritizeWorkersToDelete { setupLog.Info("Feature flag prioritize-workers-to-delete is enabled.") } From c8951300c5076768c52a97871e947643ae610f1d Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Sat, 28 May 2022 00:20:47 -0700 Subject: [PATCH 32/36] explain --- ray-operator/controllers/ray/common/pod.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index dcc54d474fd..32215e2e1ec 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -253,7 +253,7 @@ func getAutoscalerContainerIndex(pod v1.Pod) (autoscalerContainerIndex int) { } // not found, use second container - // (This code shouldn't be accessed.) + // (This branch shouldn't be accessed -- the autoscaler container should be present.) // (In any case, unit tests validate formatting of the autoscaler container.) return 1 } From f601cb8bdd31a34d34e3372744a9ca13ef66c1f8 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Sat, 28 May 2022 00:22:08 -0700 Subject: [PATCH 33/36] Typo --- ray-operator/controllers/ray/common/pod.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 32215e2e1ec..0168d963b50 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -450,7 +450,7 @@ func addEmptyDir(container *v1.Container, pod *v1.Pod, volumeName string, volume //Format an emptyDir volume. //When the storage medium is memory, set the size limit based on container resources. -//For others media , don't set a size limit. +//For other media, don't set a size limit. func makeEmptyDirVolume(container *v1.Container, volumeName string, storageMedium v1.StorageMedium) v1.Volume { var sizeLimit *resource.Quantity if storageMedium == v1.StorageMediumMemory { From 704519d01442f315f70de47312ba6c5cde0b02be Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Sat, 28 May 2022 00:33:24 -0700 Subject: [PATCH 34/36] pods.go: Spaces --- ray-operator/controllers/ray/common/pod.go | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 0168d963b50..f776ed882ac 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -113,13 +113,13 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN } rayContainerIndex := getRayContainerIndex(pod) - //Add /dev/shm volumeMount for the object store to avoid performance degradation. + // Add /dev/shm volumeMount for the object store to avoid performance degradation. addEmptyDir(&pod.Spec.Containers[rayContainerIndex], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, v1.StorageMediumMemory) if rayNodeType == rayiov1alpha1.HeadNode && enableRayAutoscaler != nil && *enableRayAutoscaler { - //The Ray autoscaler writes logs which are read by the Ray head. - //We need a shared log volume to enable this information flow. - //Specifically, this is required for the event-logging functionality - //introduced in https://github.com/ray-project/ray/pull/13434. + // The Ray autoscaler writes logs which are read by the Ray head. + // We need a shared log volume to enable this information flow. + // Specifically, this is required for the event-logging functionality + // introduced in https://github.com/ray-project/ray/pull/13434. autoscalerContainerIndex := getAutoscalerContainerIndex(pod) addEmptyDir(&pod.Spec.Containers[rayContainerIndex], &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) addEmptyDir(&pod.Spec.Containers[autoscalerContainerIndex], &pod, RayLogVolumeName, RayLogVolumeMountPath, v1.StorageMediumDefault) @@ -448,16 +448,16 @@ func addEmptyDir(container *v1.Container, pod *v1.Pod, volumeName string, volume container.VolumeMounts = append(container.VolumeMounts, mountedVolume) } -//Format an emptyDir volume. -//When the storage medium is memory, set the size limit based on container resources. -//For other media, don't set a size limit. +// Format an emptyDir volume. +// When the storage medium is memory, set the size limit based on container resources. +// For other media, don't set a size limit. func makeEmptyDirVolume(container *v1.Container, volumeName string, storageMedium v1.StorageMedium) v1.Volume { var sizeLimit *resource.Quantity if storageMedium == v1.StorageMediumMemory { - //If using memory, set size limit based on primary container's resources. + // If using memory, set size limit based on primary container's resources. sizeLimit = findMemoryReqOrLimit(*container) } else { - //Otherwise, don't set a limit. + // Otherwise, don't set a limit. sizeLimit = nil } return v1.Volume{ @@ -471,8 +471,8 @@ func makeEmptyDirVolume(container *v1.Container, volumeName string, storageMediu } } -//Checks if the container has a volumeMount with the given mount path and if -//the pod has a matching Volume. +// Checks if the container has a volumeMount with the given mount path and if +// the pod has a matching Volume. func checkIfVolumeMounted(container *v1.Container, pod *v1.Pod, volumeMountPath string) bool { for _, mountedVol := range container.VolumeMounts { if mountedVol.MountPath == volumeMountPath { @@ -487,7 +487,7 @@ func checkIfVolumeMounted(container *v1.Container, pod *v1.Pod, volumeMountPath return false } -//Checks if a volume with the given name exists. +// Checks if a volume with the given name exists. func checkIfVolumeExists(pod *v1.Pod, volumeName string) bool { for _, podVolume := range pod.Spec.Volumes { if podVolume.Name == volumeName { From 05a18d27dab0e4bc4e122d0fcb9c0e9c932e1879 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Sat, 28 May 2022 00:39:24 -0700 Subject: [PATCH 35/36] Test Typo --- ray-operator/controllers/ray/common/pod_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index e2b4ee47f3c..d3514c5496d 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -326,7 +326,7 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) { numContainers := len(pod.Spec.Containers) expectedNumContainers := 2 if !(numContainers == expectedNumContainers) { - t.Fatalf("Expected `%v` container but got `%v`", expectedVolumes, actualVolumes) + t.Fatalf("Expected `%v` container but got `%v`", expectedNumContainers, numContainers) } index := getAutoscalerContainerIndex(pod) actualContainer := pod.Spec.Containers[index] From 0cef0cc6676368f41023ad38ef25f80387b211f8 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman Date: Sat, 28 May 2022 15:54:13 -0700 Subject: [PATCH 36/36] Container indices: Log and panic. --- ray-operator/controllers/ray/common/pod.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index f776ed882ac..780363f1f02 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -236,11 +236,13 @@ func getRayContainerIndex(pod v1.Pod) (rayContainerIndex int) { for i, container := range pod.Spec.Containers { for _, env := range container.Env { if env.Name == strings.ToLower("ray") && env.Value == strings.ToLower("true") { + log.Info("Head pod container with index " + strconv.Itoa(i) + " identified as Ray container based on env RAY=true.") return i } } } // not found, use first container + log.Info("Head pod container with index 0 identified as Ray container.") return 0 } @@ -252,10 +254,8 @@ func getAutoscalerContainerIndex(pod v1.Pod) (autoscalerContainerIndex int) { } } - // not found, use second container - // (This branch shouldn't be accessed -- the autoscaler container should be present.) - // (In any case, unit tests validate formatting of the autoscaler container.) - return 1 + // This should be unreachable. + panic("Autoscaler container not found!") } // labelPod returns the labels for selecting the resources