From 3e11cdea0fe762b217154044803571a6aecb3038 Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Thu, 12 Aug 2021 20:28:54 -0700 Subject: [PATCH 1/3] Update training operator release process (#1347) 1. Change release process 2. Checking python tool to generate changelog 3. Fix v1.2.0 changlog --- CHANGELOG.md | 4 +- docs/release/release.py | 43 +++++++++++ docs/release/releasing.md | 156 +++++++++++++++++++++++++------------- 3 files changed, 148 insertions(+), 55 deletions(-) create mode 100644 docs/release/release.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fda7eec7e..f20be8b4e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ # Changelog -## [v1.1.1](https://github.com/kubeflow/tf-operator/tree/v1.1.1) (2021-08-03) +## [v1.2.0](https://github.com/kubeflow/tf-operator/tree/v1.2.0) (2021-08-03) -[Full Changelog](https://github.com/kubeflow/tf-operator/compare/v1.1.0...v1.1.1) +[Full Changelog](https://github.com/kubeflow/tf-operator/compare/v1.1.0...v1.2.0) ## Features diff --git a/docs/release/release.py b/docs/release/release.py new file mode 100644 index 0000000000..0dffd95c4b --- /dev/null +++ b/docs/release/release.py @@ -0,0 +1,43 @@ +from github import Github +import re + + +class ChangelogGenerator: + def __init__(self, github_repo): + # Replace with your Github Token + self._github = Github('') + self._github_repo = self._github.get_repo(github_repo) + + def generate(self, pr_id): + pr = self._github_repo.get_pull(pr_id) + + return "{title} ([#{pr_id}]({pr_link}), @{user})".format( + title=pr.title, + pr_id=pr_id, + pr_link=pr.html_url, + user=pr.user.login + ) + + +# generated by `git log ..HEAD --oneline` +payload = ''' +6f1e96c4 Update container image for v1.1.1 (#1328) +47a74b73 add a specific version of tensorflow_datasets (#1305) +e3061132 Remove vendor folder (#1288) +eb362bd8 Fix invalid pointer when tfjob is deleted (#1285) +0c41b273 fix get_logs pod_names type and iteration blocking (#1280) +af5bdd58 Add job namespace to 
`tf_operator_jobs_*` counters (#1283) +6fd9489e fix custom_api.delete_namespaced_custom_object args (#1281) +c095f7a9 feat: upgrade kubeflow common and volcano version (#1276) +13b17b0e Use remote Kustomize build option in standalone installation instructions (#1266) +faf34868 fix: Remove the dup comment tag (#1274) +9a297876 add podgroups rule in cluster-role.yaml (#1272) +58c9bc4a Fix: the "follow" of TFJobClient.get_logs (#1254) +3d9e7c8a Add task type annotation for pods when EnableGangScheduling is true. (#1268) +8d179f70 Fix: Remove Github CD workflow (#1263) +''' + +g = ChangelogGenerator("kubeflow/tf-operator") +for pr_match in re.finditer(r"#(\d+)", payload): + pr_id = int(pr_match.group(1)) + print("* {}".format(g.generate(pr_id))) diff --git a/docs/release/releasing.md b/docs/release/releasing.md index a1f6f49001..ced3d7fd0a 100644 --- a/docs/release/releasing.md +++ b/docs/release/releasing.md @@ -1,53 +1,103 @@ -# Releasing the TFJob operator - -Permissions - - * You need to be a member of release-team@kubeflow.org to have access to the GCP - resources used for releasing. - - * You need write permissions on the repository to create a release branch. - - -Look at the [postsubmit dashboard](https://k8s-testgrid.appspot.com/sig-big-data#kubeflow-tf-operator-postsubmit) -to find the latest green postsubmit. - - -Use the GitHub UI to cut a release branch - * Name the release branch v{MAJOR}.${MINOR}-branch - -Checkout the release branch - -We build TFJob operator by running the E2E test workflow. - -Look at the [postsubmit dashboard](https://k8s-testgrid.appspot.com/sig-big-data#kubeflow-tf-operator-postsubmit) -to find the latest green postsubmit. - -Check out that commit (in this example, we'll use `6214e560`): - -Run the E2E test workflow using our release cluster - -[kubeflow/testing#42](https://github.com/kubeflow/testing/issues/42) will simplify this. - -``` -submit_release_job.sh ${COMMIT} -``` - -You can monitor the workflow using the Argo UI. 
For our release cluster, we don't expose the Argo UI publicly, so you'll need to connect via kubectl port-forward: - -``` -kubectl -n kubeflow-releasing port-forward `kubectl -n kubeflow-releasing get pods --selector=app=argo-ui -o jsonpath='{.items[0].metadata.name}'` 8080:8001 -``` - -[kubeflow/testing#43](https://github.com/kubeflow/testing/issues/43) is tracking setup of IAP to make this easier. - -Make sure the Argo workflow completes successfully. -Check the junit files to make sure there were no actual test failures. -The junit files will be in [gs://kubeflow-releasing-artifacts](https://console.cloud.google.com/storage/browser/kubeflow-releasing-artifacts/logs/kubeflow_tf-operator/tf-operator-release/?project=kubeflow-releasing). - * The build artifacts will be in a directory named after the build number - -If the tests pass use the GitHub UI to create a release tagged v{MAJOR}-{MINOR}-{PATCH} - * If its an RC append -RC.N - * In the notes create a link to the Docker image in GCR - * For the label use the `sha256` and not the label so it is immutable. - -To release new ksonnet configs with the image following [kubeflow/kubeflow/releasing.md](https://github.com/kubeflow/kubeflow/blob/master/releasing.md). +# Releasing the training operator + +## Prerequisite + +1. Permissions + - You need to be a member of release-team@kubeflow.org. + - You need write permissions on the repository to create a release tag/branch. + +2. Prepare your Github Token + +3. Install Github python dependencies to generate changelog + ``` + pip install PyGithub + ``` + +### Release Process + +1. Make sure the last commit you want to release has passed `kubeflow-tf-operator-postsubmit` testing. + +1. Check out that commit (in this example, we'll use `6214e560`). + +1. Depending on what version you want to release, + - Major or Minor version - Use the GitHub UI to cut a release branch and name the release branch `v{MAJOR}.{MINOR}-branch` + - Patch version - You don't need to cut a release branch.
+ +1. Create a new PR against the release branch to change container image in manifest to point to that commit hash. + + ``` + images: + - name: kubeflow/training-operator + newName: kubeflow/training-operator + newTag: ${commit_hash} + ``` + + > note: post submit job will always build a new image using the `PULL_BASE_HASH` as image tag. + +1. Create a tag and push tag to upstream. + + ``` + git tag v1.2.0 + git push upstream v1.2.0 + ``` + +1. Run the following command to fetch the one-line git commits from the last release (v1.1.0) to the current release (v1.2.0). + + ``` + git log v1.1.0..v1.2.0 --oneline + ``` + +1. Copy the above commit history into `release.py` and replace the empty string in `Github('')` with your Github token. + Run this Python script to generate the changelog. + + ``` + from github import Github + import re + + + class ChangelogGenerator: + def __init__(self, github_repo): + # Replace with your Github Token + self._github = Github('') + self._github_repo = self._github.get_repo(github_repo) + + def generate(self, pr_id): + pr = self._github_repo.get_pull(pr_id) + + return "{title} ([#{pr_id}]({pr_link}), @{user})".format( + title=pr.title, + pr_id=pr_id, + pr_link=pr.html_url, + user=pr.user.login + ) + + + # generated by `git log ..
--oneline` + payload = ''' + 6f1e96c4 Update container image for v1.2.0 (#1328) + 47a74b73 add a specific version of tensorflow_datasets (#1305) + e3061132 Remove vendor folder (#1288) + eb362bd8 Fix invalid pointer when tfjob is deleted (#1285) + 0c41b273 fix get_logs pod_names type and iteration blocking (#1280) + af5bdd58 Add job namespace to `tf_operator_jobs_*` counters (#1283) + 6fd9489e fix custom_api.delete_namespaced_custom_object args (#1281) + c095f7a9 feat: upgrade kubeflow common and volcano version (#1276) + 13b17b0e Use remote Kustomize build option in standalone installation instructions (#1266) + faf34868 fix: Remove the dup comment tag (#1274) + 9a297876 add podgroups rule in cluster-role.yaml (#1272) + 58c9bc4a Fix: the "follow" of TFJobClient.get_logs (#1254) + 3d9e7c8a Add task type annotation for pods when EnableGangScheduling is true. (#1268) + 8d179f70 Fix: Remove Github CD workflow (#1263) + ''' + + g = ChangelogGenerator("kubeflow/tf-operator") + for pr_match in re.finditer(r"#(\d+)", payload): + pr_id = int(pr_match.group(1)) + print("* {}".format(g.generate(pr_id))) + ``` + +1. Cut release from tags and copy results from last step. You can group commits into `Features`, `Bugs` etc. +See example [v1.2.0 release](https://github.com/kubeflow/tf-operator/releases/tag/v1.2.0) + +1. 
Send a PR to update [CHANGELOG.md](../../CHANGELOG.md) + \ No newline at end of file From 46c586463e3c13b0d4f2d534aae84a46329dc4ef Mon Sep 17 00:00:00 2001 From: Deepak Muley Date: Thu, 12 Aug 2021 23:10:54 -0700 Subject: [PATCH 2/3] 1322: Modified manifests to use all-in-one training-operator (#1346) * 1322: Modified manifests to use all-in-one training-operator WIP Actions taken: - replaced tf-job-operator => training-operator - replaced kubeflow-tfjobs- => kubeflow-training- - moved crds for mxjobs, tgjobs, pytorchjobs and xgboostjobs from config/crd/bases to manifests/base/ and prefixed them with crd_ Ref: https://github.com/kubeflow/tf-operator/issues/1322 Testing steps: To be added Work in Progress * 1322: synced up config/manager with manifests Training operator was found to be working
k -n kubeflow logs -f training-operator-694766989-pp2j4
I0812 21:43:24.739862       1 request.go:645] Throttling request took 1.048945631s, request: GET:https://172.19.0.1:443/apis/networking.k8s.io/v1?timeout=32s
2021-08-12T21:43:25.694Z	INFO	controller-runtime.metrics	metrics server is starting to listen	{"addr": ":8080"}
2021-08-12T21:43:25.790Z	INFO	setup	starting manager
2021-08-12T21:43:25.790Z	INFO	controller-runtime.manager	starting metrics server	{"path": "/metrics"}
2021-08-12T21:43:25.790Z	INFO	controller-runtime.manager.controller.tf-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:25.790Z	INFO	controller-runtime.manager.controller.mxnet-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:25.791Z	INFO	controller-runtime.manager.controller.pytorchjob-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:25.791Z	INFO	controller-runtime.manager.controller.xgboostjob-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:26.289Z	INFO	controller-runtime.manager.controller.xgboostjob-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:26.294Z	INFO	controller-runtime.manager.controller.pytorchjob-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:26.589Z	INFO	controller-runtime.manager.controller.mxnet-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:26.688Z	INFO	controller-runtime.manager.controller.tf-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:26.889Z	INFO	controller-runtime.manager.controller.tf-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:26.889Z	INFO	controller-runtime.manager.controller.pytorchjob-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:26.890Z	INFO	controller-runtime.manager.controller.xgboostjob-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:26.890Z	INFO	controller-runtime.manager.controller.mxnet-operator	Starting EventSource	{"source": "kind source: /, Kind="}
2021-08-12T21:43:26.990Z	INFO	controller-runtime.manager.controller.xgboostjob-operator	Starting Controller
2021-08-12T21:43:26.990Z	INFO	controller-runtime.manager.controller.tf-operator	Starting Controller
2021-08-12T21:43:26.990Z	INFO	controller-runtime.manager.controller.tf-operator	Starting workers	{"worker count": 1}
2021-08-12T21:43:26.990Z	INFO	controller-runtime.manager.controller.pytorchjob-operator	Starting Controller
2021-08-12T21:43:26.991Z	INFO	controller-runtime.manager.controller.xgboostjob-operator	Starting workers	{"worker count": 1}
2021-08-12T21:43:26.991Z	INFO	controller-runtime.manager.controller.pytorchjob-operator	Starting workers	{"worker count": 1}
2021-08-12T21:43:26.991Z	INFO	controller-runtime.manager.controller.mxnet-operator	Starting Controller
2021-08-12T21:43:26.991Z	INFO	controller-runtime.manager.controller.mxnet-operator	Starting workers	{"worker count": 1}
* 1322: incorporated review comments - added all resources in ClusterRole * 1322: incorporated review comments - now controller-gen generates the crds directly in manifests/base instead of config/crd/bases - updated setup-training-operator.sh to use manifests/overlays/standalone * 1322: removed config/crd/bases as its now getting generated in manifests * 1322: incorporated review comments related to using separate role files * 1322: removed image name replacement --- Makefile | 2 +- config/crd/kustomization.yaml | 13 -- config/crd/kustomizeconfig.yaml | 19 --- config/manager/manager.yaml | 57 ------- manifests/base/cluster-role-binding.yaml | 8 +- manifests/base/cluster-role.yaml | 139 ++++++------------ manifests/base/crd.yaml | 52 ------- manifests/base/deployment.yaml | 29 ---- .../base}/kubeflow.org_mxjobs.yaml | 0 .../base}/kubeflow.org_pytorchjobs.yaml | 0 .../base}/kubeflow.org_tfjobs.yaml | 0 .../base}/kubeflow.org_xgboostjobs.yaml | 0 manifests/base/kustomization.yaml | 19 +-- manifests/base/service-account.yaml | 4 +- manifests/base/service.yaml | 6 +- .../default/kustomization.yaml | 0 .../default/manager_config_patch.yaml | 0 .../manager/controller_manager_config.yaml | 0 .../manager/kustomization.yaml | 6 +- manifests/manager/manager.yaml | 50 +++++++ .../kubeflow/kubeflow-training-roles.yaml | 53 +++++++ .../overlays/kubeflow/kustomization.yaml | 16 +- .../overlays/standalone/kustomization.yaml | 17 +-- .../overlays/standalone_v2/kustomization.yaml | 15 -- .../overlays/standalone_v2/namespace.yaml | 4 - .../prometheus/kustomization.yaml | 0 {config => manifests}/prometheus/monitor.yaml | 0 .../rbac/auth_proxy_client_clusterrole.yaml | 0 .../rbac/auth_proxy_role.yaml | 0 .../rbac/auth_proxy_role_binding.yaml | 0 .../rbac/auth_proxy_service.yaml | 0 {config => manifests}/rbac/kustomization.yaml | 0 .../rbac/leader_election_role.yaml | 0 .../rbac/leader_election_role_binding.yaml | 0 .../rbac/mxjob_editor_role.yaml | 0 .../rbac/mxjob_viewer_role.yaml | 0 
.../rbac/pytorchjob_editor_role.yaml | 0 .../rbac/pytorchjob_viewer_role.yaml | 0 {config => manifests}/rbac/role.yaml | 0 {config => manifests}/rbac/role_binding.yaml | 0 .../rbac/service_account.yaml | 0 .../rbac/tfjob_editor_role.yaml | 0 .../rbac/tfjob_viewer_role.yaml | 0 .../rbac/xgboostjob_editor_role.yaml | 0 .../rbac/xgboostjob_viewer_role.yaml | 0 scripts/setup-training-operator.sh | 2 +- 46 files changed, 180 insertions(+), 331 deletions(-) delete mode 100644 config/crd/kustomization.yaml delete mode 100644 config/crd/kustomizeconfig.yaml delete mode 100644 config/manager/manager.yaml delete mode 100644 manifests/base/crd.yaml delete mode 100644 manifests/base/deployment.yaml rename {config/crd/bases => manifests/base}/kubeflow.org_mxjobs.yaml (100%) rename {config/crd/bases => manifests/base}/kubeflow.org_pytorchjobs.yaml (100%) rename {config/crd/bases => manifests/base}/kubeflow.org_tfjobs.yaml (100%) rename {config/crd/bases => manifests/base}/kubeflow.org_xgboostjobs.yaml (100%) rename {config => manifests}/default/kustomization.yaml (100%) rename {config => manifests}/default/manager_config_patch.yaml (100%) rename {config => manifests}/manager/controller_manager_config.yaml (100%) rename {config => manifests}/manager/kustomization.yaml (65%) create mode 100644 manifests/manager/manager.yaml create mode 100644 manifests/overlays/kubeflow/kubeflow-training-roles.yaml delete mode 100644 manifests/overlays/standalone_v2/kustomization.yaml delete mode 100644 manifests/overlays/standalone_v2/namespace.yaml rename {config => manifests}/prometheus/kustomization.yaml (100%) rename {config => manifests}/prometheus/monitor.yaml (100%) rename {config => manifests}/rbac/auth_proxy_client_clusterrole.yaml (100%) rename {config => manifests}/rbac/auth_proxy_role.yaml (100%) rename {config => manifests}/rbac/auth_proxy_role_binding.yaml (100%) rename {config => manifests}/rbac/auth_proxy_service.yaml (100%) rename {config => manifests}/rbac/kustomization.yaml 
(100%) rename {config => manifests}/rbac/leader_election_role.yaml (100%) rename {config => manifests}/rbac/leader_election_role_binding.yaml (100%) rename {config => manifests}/rbac/mxjob_editor_role.yaml (100%) rename {config => manifests}/rbac/mxjob_viewer_role.yaml (100%) rename {config => manifests}/rbac/pytorchjob_editor_role.yaml (100%) rename {config => manifests}/rbac/pytorchjob_viewer_role.yaml (100%) rename {config => manifests}/rbac/role.yaml (100%) rename {config => manifests}/rbac/role_binding.yaml (100%) rename {config => manifests}/rbac/service_account.yaml (100%) rename {config => manifests}/rbac/tfjob_editor_role.yaml (100%) rename {config => manifests}/rbac/tfjob_viewer_role.yaml (100%) rename {config => manifests}/rbac/xgboostjob_editor_role.yaml (100%) rename {config => manifests}/rbac/xgboostjob_viewer_role.yaml (100%) diff --git a/Makefile b/Makefile index 41b0e3112d..6f935bdfcb 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ help: ## Display this help. ##@ Development manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. - $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=manager-role webhook paths="./pkg/apis/..." output:crd:artifacts:config=config/crd/bases + $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=manager-role webhook paths="./pkg/apis/..." output:crd:artifacts:config=manifests/base generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./pkg/apis/..." diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml deleted file mode 100644 index a590114997..0000000000 --- a/config/crd/kustomization.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# This kustomization.yaml is not intended to be run by itself, -# since it depends on service name and namespace that are out of this kustomize package. 
-# It should be run by config/default -resources: -- bases/kubeflow.org_xgboostjobs.yaml -- bases/kubeflow.org_pytorchjobs.yaml -- bases/kubeflow.org_tfjobs.yaml -- bases/kubeflow.org_mxjobs.yaml -#+kubebuilder:scaffold:crdkustomizeresource - -# the following config is for teaching kustomize how to do kustomization for CRDs. -configurations: -- kustomizeconfig.yaml diff --git a/config/crd/kustomizeconfig.yaml b/config/crd/kustomizeconfig.yaml deleted file mode 100644 index 13425cc3c6..0000000000 --- a/config/crd/kustomizeconfig.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# This file is for teaching kustomize how to substitute name and namespace reference in CRD -nameReference: - - kind: Service - version: v1 - fieldSpecs: - - kind: CustomResourceDefinition - version: v1 - group: apiextensions.k8s.io - path: spec/conversion/webhook/clientConfig/service/name - -namespace: - - kind: CustomResourceDefinition - version: v1 - group: apiextensions.k8s.io - path: spec/conversion/webhook/clientConfig/service/namespace - create: false - -varReference: - - path: metadata/annotations \ No newline at end of file diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml deleted file mode 100644 index 1c018da451..0000000000 --- a/config/manager/manager.yaml +++ /dev/null @@ -1,57 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - labels: - control-plane: kubeflow-training-operator - name: kubeflow ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: training-operator - namespace: system - labels: - control-plane: kubeflow-training-operator -spec: - selector: - matchLabels: - control-plane: kubeflow-training-operator - replicas: 1 - template: - metadata: - labels: - control-plane: kubeflow-training-operator - spec: -# securityContext: -# runAsNonRoot: true - containers: - - command: - - /manager -# disable leader-elect now -# args: -# - --leader-elect - image: kubeflow/training-operator:v1.0.0 - name: manager - securityContext: - allowPrivilegeEscalation: false 
- livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - cpu: 100m - memory: 30Mi - requests: - cpu: 100m - memory: 20Mi - serviceAccountName: training-operator-service-account - terminationGracePeriodSeconds: 10 diff --git a/manifests/base/cluster-role-binding.yaml b/manifests/base/cluster-role-binding.yaml index e05aad7fc4..2e5b93dcb5 100644 --- a/manifests/base/cluster-role-binding.yaml +++ b/manifests/base/cluster-role-binding.yaml @@ -3,12 +3,12 @@ apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: labels: - app: tf-job-operator - name: tf-job-operator + app: training-operator + name: training-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: tf-job-operator + name: training-operator subjects: - kind: ServiceAccount - name: tf-job-operator + name: training-operator diff --git a/manifests/base/cluster-role.yaml b/manifests/base/cluster-role.yaml index 5e599a7831..702a3f8cd5 100644 --- a/manifests/base/cluster-role.yaml +++ b/manifests/base/cluster-role.yaml @@ -3,100 +3,47 @@ apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: labels: - app: tf-job-operator - name: tf-job-operator + app: training-operator + name: training-operator rules: -- apiGroups: - - kubeflow.org - resources: - - tfjobs - - tfjobs/status - - tfjobs/finalizers - verbs: - - '*' -- apiGroups: - - apiextensions.k8s.io - resources: - - customresourcedefinitions - verbs: - - '*' -- apiGroups: - - "" - resources: - - pods - - services - - endpoints - - events - verbs: - - '*' -- apiGroups: - - apps - - extensions - resources: - - deployments - verbs: - - '*' -- apiGroups: - - scheduling.volcano.sh - resources: - - podgroups - verbs: - - '*' - ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: 
kubeflow-tfjobs-admin - labels: - rbac.authorization.kubeflow.org/aggregate-to-kubeflow-admin: "true" -aggregationRule: - clusterRoleSelectors: - - matchLabels: - rbac.authorization.kubeflow.org/aggregate-to-kubeflow-tfjobs-admin: "true" -rules: [] - ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: kubeflow-tfjobs-edit - labels: - rbac.authorization.kubeflow.org/aggregate-to-kubeflow-edit: "true" - rbac.authorization.kubeflow.org/aggregate-to-kubeflow-tfjobs-admin: "true" -rules: -- apiGroups: - - kubeflow.org - resources: - - tfjobs - - tfjobs/status - verbs: - - get - - list - - watch - - create - - delete - - deletecollection - - patch - - update - ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: kubeflow-tfjobs-view - labels: - rbac.authorization.kubeflow.org/aggregate-to-kubeflow-view: "true" -rules: -- apiGroups: - - kubeflow.org - resources: - - tfjobs - - tfjobs/status - verbs: - - get - - list - - watch + - apiGroups: + - kubeflow.org + resources: + - tfjobs + - mxjobs + - pytorchjobs + - xgboostjobs + - tfjobs/status + - pytorchjobs/status + - mxjobs/status + - xgboostjobs/status + verbs: + - "*" + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - "*" + - apiGroups: + - "" + resources: + - pods + - services + - endpoints + - events + verbs: + - "*" + - apiGroups: + - apps + - extensions + resources: + - deployments + verbs: + - "*" + - apiGroups: + - scheduling.volcano.sh + resources: + - podgroups + verbs: + - "*" diff --git a/manifests/base/crd.yaml b/manifests/base/crd.yaml deleted file mode 100644 index 71384fcfad..0000000000 --- a/manifests/base/crd.yaml +++ /dev/null @@ -1,52 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: tfjobs.kubeflow.org -spec: - additionalPrinterColumns: - - JSONPath: .status.conditions[-1:].type - name: State - type: string - - JSONPath: 
.metadata.creationTimestamp - name: Age - type: date - group: kubeflow.org - names: - kind: TFJob - plural: tfjobs - singular: tfjob - scope: Namespaced - subresources: - status: {} - validation: - openAPIV3Schema: - properties: - spec: - properties: - tfReplicaSpecs: - properties: - Chief: - properties: - replicas: - maximum: 1 - minimum: 1 - type: integer - PS: - properties: - replicas: - minimum: 1 - type: integer - Worker: - properties: - replicas: - minimum: 1 - type: integer - Evaluator: - properties: - replicas: - minimum: 0 - type: integer - versions: - - name: v1 - served: true - storage: true diff --git a/manifests/base/deployment.yaml b/manifests/base/deployment.yaml deleted file mode 100644 index 346e36476d..0000000000 --- a/manifests/base/deployment.yaml +++ /dev/null @@ -1,29 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: tf-job-operator -spec: - replicas: 1 - template: - metadata: - labels: - name: tf-job-operator - annotations: - sidecar.istio.io/inject: "false" - spec: - containers: - - args: - - -monitoring-port=8443 - env: - - name: MY_POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - image: public.ecr.aws/j1r0q0g6/training/tf-operator - name: tf-job-operator - serviceAccountName: tf-job-operator diff --git a/config/crd/bases/kubeflow.org_mxjobs.yaml b/manifests/base/kubeflow.org_mxjobs.yaml similarity index 100% rename from config/crd/bases/kubeflow.org_mxjobs.yaml rename to manifests/base/kubeflow.org_mxjobs.yaml diff --git a/config/crd/bases/kubeflow.org_pytorchjobs.yaml b/manifests/base/kubeflow.org_pytorchjobs.yaml similarity index 100% rename from config/crd/bases/kubeflow.org_pytorchjobs.yaml rename to manifests/base/kubeflow.org_pytorchjobs.yaml diff --git a/config/crd/bases/kubeflow.org_tfjobs.yaml b/manifests/base/kubeflow.org_tfjobs.yaml similarity index 100% rename from config/crd/bases/kubeflow.org_tfjobs.yaml 
rename to manifests/base/kubeflow.org_tfjobs.yaml diff --git a/config/crd/bases/kubeflow.org_xgboostjobs.yaml b/manifests/base/kubeflow.org_xgboostjobs.yaml similarity index 100% rename from config/crd/bases/kubeflow.org_xgboostjobs.yaml rename to manifests/base/kubeflow.org_xgboostjobs.yaml diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index c1b91df23a..f45174ffa4 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -2,14 +2,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: kubeflow resources: -- crd.yaml -- cluster-role-binding.yaml -- cluster-role.yaml -- deployment.yaml -- service-account.yaml -- service.yaml -commonLabels: - app: tf-job-operator - kustomize.component: tf-job-operator - app.kubernetes.io/component: tfjob - app.kubernetes.io/name: tf-job-operator + - kubeflow.org_tfjobs.yaml + - kubeflow.org_mxjobs.yaml + - kubeflow.org_pytorchjobs.yaml + - kubeflow.org_xgboostjobs.yaml + - cluster-role-binding.yaml + - cluster-role.yaml + - service-account.yaml + - service.yaml diff --git a/manifests/base/service-account.yaml b/manifests/base/service-account.yaml index 2ea6a87d29..e6f10afc28 100644 --- a/manifests/base/service-account.yaml +++ b/manifests/base/service-account.yaml @@ -2,5 +2,5 @@ apiVersion: v1 kind: ServiceAccount metadata: labels: - app: tf-job-operator - name: tf-job-operator + app: training-operator + name: training-operator diff --git a/manifests/base/service.yaml b/manifests/base/service.yaml index 97f92e3ea1..e95f9f4727 100644 --- a/manifests/base/service.yaml +++ b/manifests/base/service.yaml @@ -7,13 +7,13 @@ metadata: prometheus.io/scrape: "true" prometheus.io/port: "8443" labels: - app: tf-job-operator - name: tf-job-operator + app: training-operator + name: training-operator spec: ports: - name: monitoring-port port: 8443 targetPort: 8443 selector: - name: tf-job-operator + name: training-operator type: ClusterIP diff --git 
a/config/default/kustomization.yaml b/manifests/default/kustomization.yaml similarity index 100% rename from config/default/kustomization.yaml rename to manifests/default/kustomization.yaml diff --git a/config/default/manager_config_patch.yaml b/manifests/default/manager_config_patch.yaml similarity index 100% rename from config/default/manager_config_patch.yaml rename to manifests/default/manager_config_patch.yaml diff --git a/config/manager/controller_manager_config.yaml b/manifests/manager/controller_manager_config.yaml similarity index 100% rename from config/manager/controller_manager_config.yaml rename to manifests/manager/controller_manager_config.yaml diff --git a/config/manager/kustomization.yaml b/manifests/manager/kustomization.yaml similarity index 65% rename from config/manager/kustomization.yaml rename to manifests/manager/kustomization.yaml index 1d1341d9b5..638fcb401e 100644 --- a/config/manager/kustomization.yaml +++ b/manifests/manager/kustomization.yaml @@ -1,5 +1,5 @@ resources: -- manager.yaml + - manager.yaml #generatorOptions: # disableNameSuffixHash: true @@ -10,7 +10,3 @@ resources: # name: manager-config apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -images: -- name: kubeflow/training-operator - newName: kubeflow/training-operator - newTag: latest diff --git a/manifests/manager/manager.yaml b/manifests/manager/manager.yaml new file mode 100644 index 0000000000..129776ad72 --- /dev/null +++ b/manifests/manager/manager.yaml @@ -0,0 +1,50 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: training-operator + namespace: system + labels: + control-plane: kubeflow-training-operator +spec: + selector: + matchLabels: + control-plane: kubeflow-training-operator + replicas: 1 + template: + metadata: + labels: + control-plane: kubeflow-training-operator + spec: + # securityContext: + # runAsNonRoot: true + containers: + - command: + - /manager + # disable leader-elect now + # args: + # - --leader-elect + image: 
kubeflow/training-operator:v1.0.0 + name: manager + securityContext: + allowPrivilegeEscalation: false + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 100m + memory: 30Mi + requests: + cpu: 100m + memory: 20Mi + serviceAccountName: training-operator-service-account + terminationGracePeriodSeconds: 10 diff --git a/manifests/overlays/kubeflow/kubeflow-training-roles.yaml b/manifests/overlays/kubeflow/kubeflow-training-roles.yaml new file mode 100644 index 0000000000..05cd2ab778 --- /dev/null +++ b/manifests/overlays/kubeflow/kubeflow-training-roles.yaml @@ -0,0 +1,53 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kubeflow-training-admin + labels: + rbac.authorization.kubeflow.org/aggregate-to-kubeflow-admin: "true" +aggregationRule: + clusterRoleSelectors: + - matchLabels: + rbac.authorization.kubeflow.org/aggregate-to-kubeflow-training-admin: "true" +rules: [] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kubeflow-training-edit + labels: + rbac.authorization.kubeflow.org/aggregate-to-kubeflow-edit: "true" + rbac.authorization.kubeflow.org/aggregate-to-kubeflow-training-admin: "true" +rules: + - apiGroups: + - kubeflow.org + resources: + - tfjobs + - tfjobs/status + verbs: + - get + - list + - watch + - create + - delete + - deletecollection + - patch + - update + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kubeflow-training-view + labels: + rbac.authorization.kubeflow.org/aggregate-to-kubeflow-view: "true" +rules: + - apiGroups: + - kubeflow.org + resources: + - tfjobs + - tfjobs/status + verbs: + - get + - list + - watch diff --git a/manifests/overlays/kubeflow/kustomization.yaml b/manifests/overlays/kubeflow/kustomization.yaml index 380504ca89..7c5e9c1ff6 100644 
--- a/manifests/overlays/kubeflow/kustomization.yaml +++ b/manifests/overlays/kubeflow/kustomization.yaml @@ -2,13 +2,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: kubeflow resources: -- ../../base -commonLabels: - app: tf-job-operator - kustomize.component: tf-job-operator - app.kubernetes.io/component: tfjob - app.kubernetes.io/name: tf-job-operator + - ../../base + - ../../manager + - ../../rbac + - kubeflow-training-roles.yaml images: -- name: public.ecr.aws/j1r0q0g6/training/tf-operator - newTag: 47a74b738920edbf4207160cec7e1dff9cdab3f2 - + - name: kubeflow/training-operator:v1.0.0 + newName: deepakmuley/kubeflow-training-operator + newTag: "1" diff --git a/manifests/overlays/standalone/kustomization.yaml b/manifests/overlays/standalone/kustomization.yaml index a37f068715..7eda9f927f 100644 --- a/manifests/overlays/standalone/kustomization.yaml +++ b/manifests/overlays/standalone/kustomization.yaml @@ -2,14 +2,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: kubeflow resources: -- ../../base -- namespace.yaml -commonLabels: - app: tf-job-operator - kustomize.component: tf-job-operator - app.kubernetes.io/component: tfjob - app.kubernetes.io/name: tf-job-operator + - ../../base + - ../../manager + - ../../rbac + - namespace.yaml images: -- name: public.ecr.aws/j1r0q0g6/training/tf-operator - newTag: 47a74b738920edbf4207160cec7e1dff9cdab3f2 - + - name: kubeflow/training-operator:v1.0.0 + newName: deepakmuley/kubeflow-training-operator + newTag: "1" diff --git a/manifests/overlays/standalone_v2/kustomization.yaml b/manifests/overlays/standalone_v2/kustomization.yaml deleted file mode 100644 index 475f3b162b..0000000000 --- a/manifests/overlays/standalone_v2/kustomization.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: kubeflow -resources: -- ../../base_v2 -- namespace.yaml -commonLabels: - app: training-operator - 
kustomize.component: training-operator - app.kubernetes.io/component: kubeflow-training - app.kubernetes.io/name: training-operator -images: -- name: public.ecr.aws/j1r0q0g6/training/training-operator - newTag: cd2fc1ff397b1f349f68524f4abd5013a32e3033 - diff --git a/manifests/overlays/standalone_v2/namespace.yaml b/manifests/overlays/standalone_v2/namespace.yaml deleted file mode 100644 index 7a940e4673..0000000000 --- a/manifests/overlays/standalone_v2/namespace.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: kubeflow diff --git a/config/prometheus/kustomization.yaml b/manifests/prometheus/kustomization.yaml similarity index 100% rename from config/prometheus/kustomization.yaml rename to manifests/prometheus/kustomization.yaml diff --git a/config/prometheus/monitor.yaml b/manifests/prometheus/monitor.yaml similarity index 100% rename from config/prometheus/monitor.yaml rename to manifests/prometheus/monitor.yaml diff --git a/config/rbac/auth_proxy_client_clusterrole.yaml b/manifests/rbac/auth_proxy_client_clusterrole.yaml similarity index 100% rename from config/rbac/auth_proxy_client_clusterrole.yaml rename to manifests/rbac/auth_proxy_client_clusterrole.yaml diff --git a/config/rbac/auth_proxy_role.yaml b/manifests/rbac/auth_proxy_role.yaml similarity index 100% rename from config/rbac/auth_proxy_role.yaml rename to manifests/rbac/auth_proxy_role.yaml diff --git a/config/rbac/auth_proxy_role_binding.yaml b/manifests/rbac/auth_proxy_role_binding.yaml similarity index 100% rename from config/rbac/auth_proxy_role_binding.yaml rename to manifests/rbac/auth_proxy_role_binding.yaml diff --git a/config/rbac/auth_proxy_service.yaml b/manifests/rbac/auth_proxy_service.yaml similarity index 100% rename from config/rbac/auth_proxy_service.yaml rename to manifests/rbac/auth_proxy_service.yaml diff --git a/config/rbac/kustomization.yaml b/manifests/rbac/kustomization.yaml similarity index 100% rename from config/rbac/kustomization.yaml 
rename to manifests/rbac/kustomization.yaml diff --git a/config/rbac/leader_election_role.yaml b/manifests/rbac/leader_election_role.yaml similarity index 100% rename from config/rbac/leader_election_role.yaml rename to manifests/rbac/leader_election_role.yaml diff --git a/config/rbac/leader_election_role_binding.yaml b/manifests/rbac/leader_election_role_binding.yaml similarity index 100% rename from config/rbac/leader_election_role_binding.yaml rename to manifests/rbac/leader_election_role_binding.yaml diff --git a/config/rbac/mxjob_editor_role.yaml b/manifests/rbac/mxjob_editor_role.yaml similarity index 100% rename from config/rbac/mxjob_editor_role.yaml rename to manifests/rbac/mxjob_editor_role.yaml diff --git a/config/rbac/mxjob_viewer_role.yaml b/manifests/rbac/mxjob_viewer_role.yaml similarity index 100% rename from config/rbac/mxjob_viewer_role.yaml rename to manifests/rbac/mxjob_viewer_role.yaml diff --git a/config/rbac/pytorchjob_editor_role.yaml b/manifests/rbac/pytorchjob_editor_role.yaml similarity index 100% rename from config/rbac/pytorchjob_editor_role.yaml rename to manifests/rbac/pytorchjob_editor_role.yaml diff --git a/config/rbac/pytorchjob_viewer_role.yaml b/manifests/rbac/pytorchjob_viewer_role.yaml similarity index 100% rename from config/rbac/pytorchjob_viewer_role.yaml rename to manifests/rbac/pytorchjob_viewer_role.yaml diff --git a/config/rbac/role.yaml b/manifests/rbac/role.yaml similarity index 100% rename from config/rbac/role.yaml rename to manifests/rbac/role.yaml diff --git a/config/rbac/role_binding.yaml b/manifests/rbac/role_binding.yaml similarity index 100% rename from config/rbac/role_binding.yaml rename to manifests/rbac/role_binding.yaml diff --git a/config/rbac/service_account.yaml b/manifests/rbac/service_account.yaml similarity index 100% rename from config/rbac/service_account.yaml rename to manifests/rbac/service_account.yaml diff --git a/config/rbac/tfjob_editor_role.yaml b/manifests/rbac/tfjob_editor_role.yaml 
similarity index 100% rename from config/rbac/tfjob_editor_role.yaml rename to manifests/rbac/tfjob_editor_role.yaml diff --git a/config/rbac/tfjob_viewer_role.yaml b/manifests/rbac/tfjob_viewer_role.yaml similarity index 100% rename from config/rbac/tfjob_viewer_role.yaml rename to manifests/rbac/tfjob_viewer_role.yaml diff --git a/config/rbac/xgboostjob_editor_role.yaml b/manifests/rbac/xgboostjob_editor_role.yaml similarity index 100% rename from config/rbac/xgboostjob_editor_role.yaml rename to manifests/rbac/xgboostjob_editor_role.yaml diff --git a/config/rbac/xgboostjob_viewer_role.yaml b/manifests/rbac/xgboostjob_viewer_role.yaml similarity index 100% rename from config/rbac/xgboostjob_viewer_role.yaml rename to manifests/rbac/xgboostjob_viewer_role.yaml diff --git a/scripts/setup-training-operator.sh b/scripts/setup-training-operator.sh index d310cce152..7f9c8b5deb 100755 --- a/scripts/setup-training-operator.sh +++ b/scripts/setup-training-operator.sh @@ -32,7 +32,7 @@ echo "Configuring kubeconfig.." 
aws eks update-kubeconfig --region=${REGION} --name=${CLUSTER_NAME} echo "Update training operator manifest with new name $REGISTRY and tag $VERSION" -cd config/default +cd manifests/overlays/standalone #kustomize edit set image public.ecr.aws/j1r0q0g6/training/training-operator=${REGISTRY}:${VERSION} kustomize edit set image kubeflow/training-operator=${REGISTRY}:${VERSION} From e78151fb989b219f516ce77d22f1bdbfb4821215 Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Fri, 13 Aug 2021 15:12:04 -0700 Subject: [PATCH 3/3] Clean up manifests and remove unused files (#1349) * Clean up manifests * Remove prometheus and manager folder * Update override image tag for kubeflow manifest * Delete leader election role * Remove non-exist tag in deployment --- manifests/base/cluster-role-binding.yaml | 2 +- manifests/base/cluster-role.yaml | 16 +++--- .../manager.yaml => base/deployment.yaml} | 23 +++++---- manifests/base/kustomization.yaml | 1 + manifests/default/kustomization.yaml | 11 ----- manifests/default/manager_config_patch.yaml | 20 -------- .../manager/controller_manager_config.yaml | 11 ----- manifests/manager/kustomization.yaml | 12 ----- .../kubeflow/kubeflow-training-roles.yaml | 33 ++++++++++--- .../overlays/kubeflow/kustomization.yaml | 8 ++- .../overlays/standalone/kustomization.yaml | 8 ++- manifests/prometheus/kustomization.yaml | 2 - manifests/prometheus/monitor.yaml | 20 -------- .../rbac/auth_proxy_client_clusterrole.yaml | 9 ---- manifests/rbac/auth_proxy_role.yaml | 17 ------- manifests/rbac/auth_proxy_role_binding.yaml | 12 ----- manifests/rbac/auth_proxy_service.yaml | 14 ------ manifests/rbac/kustomization.yaml | 19 ------- manifests/rbac/leader_election_role.yaml | 37 -------------- .../rbac/leader_election_role_binding.yaml | 12 ----- manifests/rbac/mxjob_editor_role.yaml | 24 --------- manifests/rbac/mxjob_viewer_role.yaml | 20 -------- manifests/rbac/pytorchjob_editor_role.yaml | 24 --------- manifests/rbac/pytorchjob_viewer_role.yaml | 20 
-------- manifests/rbac/role.yaml | 49 ------------------- manifests/rbac/role_binding.yaml | 12 ----- manifests/rbac/service_account.yaml | 5 -- manifests/rbac/tfjob_editor_role.yaml | 24 --------- manifests/rbac/tfjob_viewer_role.yaml | 20 -------- manifests/rbac/xgboostjob_editor_role.yaml | 24 --------- manifests/rbac/xgboostjob_viewer_role.yaml | 20 -------- 31 files changed, 57 insertions(+), 472 deletions(-) rename manifests/{manager/manager.yaml => base/deployment.yaml} (68%) delete mode 100644 manifests/default/kustomization.yaml delete mode 100644 manifests/default/manager_config_patch.yaml delete mode 100644 manifests/manager/controller_manager_config.yaml delete mode 100644 manifests/manager/kustomization.yaml delete mode 100644 manifests/prometheus/kustomization.yaml delete mode 100644 manifests/prometheus/monitor.yaml delete mode 100644 manifests/rbac/auth_proxy_client_clusterrole.yaml delete mode 100644 manifests/rbac/auth_proxy_role.yaml delete mode 100644 manifests/rbac/auth_proxy_role_binding.yaml delete mode 100644 manifests/rbac/auth_proxy_service.yaml delete mode 100644 manifests/rbac/kustomization.yaml delete mode 100644 manifests/rbac/leader_election_role.yaml delete mode 100644 manifests/rbac/leader_election_role_binding.yaml delete mode 100644 manifests/rbac/mxjob_editor_role.yaml delete mode 100644 manifests/rbac/mxjob_viewer_role.yaml delete mode 100644 manifests/rbac/pytorchjob_editor_role.yaml delete mode 100644 manifests/rbac/pytorchjob_viewer_role.yaml delete mode 100644 manifests/rbac/role.yaml delete mode 100644 manifests/rbac/role_binding.yaml delete mode 100644 manifests/rbac/service_account.yaml delete mode 100644 manifests/rbac/tfjob_editor_role.yaml delete mode 100644 manifests/rbac/tfjob_viewer_role.yaml delete mode 100644 manifests/rbac/xgboostjob_editor_role.yaml delete mode 100644 manifests/rbac/xgboostjob_viewer_role.yaml diff --git a/manifests/base/cluster-role-binding.yaml b/manifests/base/cluster-role-binding.yaml index 
2e5b93dcb5..97b47bad69 100644 --- a/manifests/base/cluster-role-binding.yaml +++ b/manifests/base/cluster-role-binding.yaml @@ -1,5 +1,5 @@ --- -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: labels: diff --git a/manifests/base/cluster-role.yaml b/manifests/base/cluster-role.yaml index 702a3f8cd5..bf8eb92b84 100644 --- a/manifests/base/cluster-role.yaml +++ b/manifests/base/cluster-role.yaml @@ -1,5 +1,5 @@ --- -apiVersion: rbac.authorization.k8s.io/v1beta1 +apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: @@ -18,13 +18,13 @@ rules: - mxjobs/status - xgboostjobs/status verbs: - - "*" - - apiGroups: - - apiextensions.k8s.io - resources: - - customresourcedefinitions - verbs: - - "*" + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - "" resources: diff --git a/manifests/manager/manager.yaml b/manifests/base/deployment.yaml similarity index 68% rename from manifests/manager/manager.yaml rename to manifests/base/deployment.yaml index 129776ad72..416d5ebf5e 100644 --- a/manifests/manager/manager.yaml +++ b/manifests/base/deployment.yaml @@ -2,7 +2,6 @@ apiVersion: apps/v1 kind: Deployment metadata: name: training-operator - namespace: system labels: control-plane: kubeflow-training-operator spec: @@ -14,17 +13,23 @@ spec: metadata: labels: control-plane: kubeflow-training-operator + annotations: + sidecar.istio.io/inject: "false" spec: - # securityContext: - # runAsNonRoot: true containers: - command: - /manager - # disable leader-elect now - # args: - # - --leader-elect - image: kubeflow/training-operator:v1.0.0 - name: manager + image: kubeflow/training-operator + name: training-operator + env: + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name securityContext: allowPrivilegeEscalation: false livenessProbe: @@ -46,5 +51,5 @@ 
spec: requests: cpu: 100m memory: 20Mi - serviceAccountName: training-operator-service-account + serviceAccountName: training-operator terminationGracePeriodSeconds: 10 diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index f45174ffa4..95653c561b 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -10,3 +10,4 @@ resources: - cluster-role.yaml - service-account.yaml - service.yaml + - deployment.yaml diff --git a/manifests/default/kustomization.yaml b/manifests/default/kustomization.yaml deleted file mode 100644 index b109d900be..0000000000 --- a/manifests/default/kustomization.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# Adds namespace to all resources. -namespace: kubeflow - -# Labels to add to all resources and selectors. -#commonLabels: -# someName: someValue - -resources: -- ../crd -- ../rbac -- ../manager diff --git a/manifests/default/manager_config_patch.yaml b/manifests/default/manager_config_patch.yaml deleted file mode 100644 index 6c400155cf..0000000000 --- a/manifests/default/manager_config_patch.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - containers: - - name: manager - args: - - "--config=controller_manager_config.yaml" - volumeMounts: - - name: manager-config - mountPath: /controller_manager_config.yaml - subPath: controller_manager_config.yaml - volumes: - - name: manager-config - configMap: - name: manager-config diff --git a/manifests/manager/controller_manager_config.yaml b/manifests/manager/controller_manager_config.yaml deleted file mode 100644 index 71b4595f10..0000000000 --- a/manifests/manager/controller_manager_config.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: controller-runtime.sigs.k8s.io/v1alpha1 -kind: ControllerManagerConfig -health: - healthProbeBindAddress: :8081 -metrics: - bindAddress: 127.0.0.1:8080 -webhook: - port: 9443 -leaderElection: - 
leaderElect: true - resourceName: 1ca428e5. diff --git a/manifests/manager/kustomization.yaml b/manifests/manager/kustomization.yaml deleted file mode 100644 index 638fcb401e..0000000000 --- a/manifests/manager/kustomization.yaml +++ /dev/null @@ -1,12 +0,0 @@ -resources: - - manager.yaml - -#generatorOptions: -# disableNameSuffixHash: true -# -#configMapGenerator: -#- files: -# - controller_manager_config.yaml -# name: manager-config -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization diff --git a/manifests/overlays/kubeflow/kubeflow-training-roles.yaml b/manifests/overlays/kubeflow/kubeflow-training-roles.yaml index 05cd2ab778..cb287d7a0d 100644 --- a/manifests/overlays/kubeflow/kubeflow-training-roles.yaml +++ b/manifests/overlays/kubeflow/kubeflow-training-roles.yaml @@ -23,16 +23,26 @@ rules: - kubeflow.org resources: - tfjobs - - tfjobs/status + - pytorchjobs + - mxjobs + - xgboostjobs verbs: - - get - - list - - watch - create - delete - - deletecollection + - get + - list - patch - update + - watch + - apiGroups: + - kubeflow.org + resources: + - tfjobs/status + - pytorchjobs/status + - mxjobs/status + - xgboostjobs/status + verbs: + - get --- apiVersion: rbac.authorization.k8s.io/v1 @@ -46,8 +56,19 @@ rules: - kubeflow.org resources: - tfjobs - - tfjobs/status + - pytorchjobs + - mxjobs + - xgboostjobs verbs: - get - list - watch + - apiGroups: + - kubeflow.org + resources: + - tfjobs/status + - pytorchjobs/status + - mxjobs/status + - xgboostjobs/status + verbs: + - get diff --git a/manifests/overlays/kubeflow/kustomization.yaml b/manifests/overlays/kubeflow/kustomization.yaml index 7c5e9c1ff6..6866f3d527 100644 --- a/manifests/overlays/kubeflow/kustomization.yaml +++ b/manifests/overlays/kubeflow/kustomization.yaml @@ -3,10 +3,8 @@ kind: Kustomization namespace: kubeflow resources: - ../../base - - ../../manager - - ../../rbac - kubeflow-training-roles.yaml images: - - name: kubeflow/training-operator:v1.0.0 - newName: 
deepakmuley/kubeflow-training-operator - newTag: "1" + - name: kubeflow/training-operator + newName: kubeflow/training-operator + newTag: "46c586463e3c13b0d4f2d534aae84a46329dc4ef" diff --git a/manifests/overlays/standalone/kustomization.yaml b/manifests/overlays/standalone/kustomization.yaml index 7eda9f927f..51b3b6e7d7 100644 --- a/manifests/overlays/standalone/kustomization.yaml +++ b/manifests/overlays/standalone/kustomization.yaml @@ -3,10 +3,8 @@ kind: Kustomization namespace: kubeflow resources: - ../../base - - ../../manager - - ../../rbac - namespace.yaml images: - - name: kubeflow/training-operator:v1.0.0 - newName: deepakmuley/kubeflow-training-operator - newTag: "1" + - name: kubeflow/training-operator + newName: kubeflow/training-operator + newTag: "46c586463e3c13b0d4f2d534aae84a46329dc4ef" diff --git a/manifests/prometheus/kustomization.yaml b/manifests/prometheus/kustomization.yaml deleted file mode 100644 index ed137168a1..0000000000 --- a/manifests/prometheus/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -resources: -- monitor.yaml diff --git a/manifests/prometheus/monitor.yaml b/manifests/prometheus/monitor.yaml deleted file mode 100644 index d19136ae71..0000000000 --- a/manifests/prometheus/monitor.yaml +++ /dev/null @@ -1,20 +0,0 @@ - -# Prometheus Monitor Service (Metrics) -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - control-plane: controller-manager - name: controller-manager-metrics-monitor - namespace: system -spec: - endpoints: - - path: /metrics - port: https - scheme: https - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - tlsConfig: - insecureSkipVerify: true - selector: - matchLabels: - control-plane: controller-manager diff --git a/manifests/rbac/auth_proxy_client_clusterrole.yaml b/manifests/rbac/auth_proxy_client_clusterrole.yaml deleted file mode 100644 index 51a75db47a..0000000000 --- a/manifests/rbac/auth_proxy_client_clusterrole.yaml +++ /dev/null @@ -1,9 +0,0 @@ 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: metrics-reader -rules: -- nonResourceURLs: - - "/metrics" - verbs: - - get diff --git a/manifests/rbac/auth_proxy_role.yaml b/manifests/rbac/auth_proxy_role.yaml deleted file mode 100644 index 80e1857c59..0000000000 --- a/manifests/rbac/auth_proxy_role.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: proxy-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create diff --git a/manifests/rbac/auth_proxy_role_binding.yaml b/manifests/rbac/auth_proxy_role_binding.yaml deleted file mode 100644 index ec7acc0a1b..0000000000 --- a/manifests/rbac/auth_proxy_role_binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: proxy-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/manifests/rbac/auth_proxy_service.yaml b/manifests/rbac/auth_proxy_service.yaml deleted file mode 100644 index 6cf656be14..0000000000 --- a/manifests/rbac/auth_proxy_service.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - control-plane: controller-manager - name: controller-manager-metrics-service - namespace: system -spec: - ports: - - name: https - port: 8443 - targetPort: https - selector: - control-plane: controller-manager diff --git a/manifests/rbac/kustomization.yaml b/manifests/rbac/kustomization.yaml deleted file mode 100644 index 1df40326d8..0000000000 --- a/manifests/rbac/kustomization.yaml +++ /dev/null @@ -1,19 +0,0 @@ -resources: -# All RBAC will be applied under this service account in -# the deployment namespace. 
You may comment out this resource -# if your manager will use a service account that exists at -# runtime. Be sure to update RoleBinding and ClusterRoleBinding -# subjects if changing service account names. -- service_account.yaml -# role.yaml doesn't exist, we should use aggregation role instead. -- role.yaml -- role_binding.yaml -- leader_election_role.yaml -- leader_election_role_binding.yaml -# Comment the following 4 lines if you want to disable -# the auth proxy (https://github.com/brancz/kube-rbac-proxy) -# which protects your /metrics endpoint. -#- auth_proxy_service.yaml -#- auth_proxy_role.yaml -#- auth_proxy_role_binding.yaml -#- auth_proxy_client_clusterrole.yaml diff --git a/manifests/rbac/leader_election_role.yaml b/manifests/rbac/leader_election_role.yaml deleted file mode 100644 index 4190ec8059..0000000000 --- a/manifests/rbac/leader_election_role.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# permissions to do leader election. -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: leader-election-role -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch diff --git a/manifests/rbac/leader_election_role_binding.yaml b/manifests/rbac/leader_election_role_binding.yaml deleted file mode 100644 index 1d1321ed4f..0000000000 --- a/manifests/rbac/leader_election_role_binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: leader-election-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: leader-election-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/manifests/rbac/mxjob_editor_role.yaml 
b/manifests/rbac/mxjob_editor_role.yaml deleted file mode 100644 index 604605005d..0000000000 --- a/manifests/rbac/mxjob_editor_role.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# permissions for end users to edit mxjobs. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: mxjob-editor-role -rules: -- apiGroups: - - kubeflow.org - resources: - - mxjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kubeflow.org - resources: - - mxjobs/status - verbs: - - get diff --git a/manifests/rbac/mxjob_viewer_role.yaml b/manifests/rbac/mxjob_viewer_role.yaml deleted file mode 100644 index bd4e695a50..0000000000 --- a/manifests/rbac/mxjob_viewer_role.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# permissions for end users to view mxjobs. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: mxjob-viewer-role -rules: -- apiGroups: - - kubeflow.org - resources: - - mxjobs - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - mxjobs/status - verbs: - - get diff --git a/manifests/rbac/pytorchjob_editor_role.yaml b/manifests/rbac/pytorchjob_editor_role.yaml deleted file mode 100644 index b9b132994a..0000000000 --- a/manifests/rbac/pytorchjob_editor_role.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# permissions for end users to edit pytorchjobs. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: pytorchjob-editor-role -rules: -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs/status - verbs: - - get diff --git a/manifests/rbac/pytorchjob_viewer_role.yaml b/manifests/rbac/pytorchjob_viewer_role.yaml deleted file mode 100644 index fec8293056..0000000000 --- a/manifests/rbac/pytorchjob_viewer_role.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# permissions for end users to view pytorchjobs. 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: pytorchjob-viewer-role -rules: -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs/status - verbs: - - get diff --git a/manifests/rbac/role.yaml b/manifests/rbac/role.yaml deleted file mode 100644 index bcd0476c29..0000000000 --- a/manifests/rbac/role.yaml +++ /dev/null @@ -1,49 +0,0 @@ - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: training-operator-clusterrole -rules: - - apiGroups: - - kubeflow.org - resources: - - tfjobs - - mxjobs - - pytorchjobs - - xgboostjobs - - tfjobs/status - - pytorchjobs/status - - mxjobs/status - - xgboostjobs/status - verbs: - - create - - delete - - get - - list - - patch - - update - - watch - - - apiGroups: - - "" - resources: - - pods - - services - - endpoints - - events - verbs: - - '*' - - apiGroups: - - apps - - extensions - resources: - - deployments - verbs: - - '*' - - apiGroups: - - scheduling.volcano.sh - resources: - - podgroups - verbs: - - '*' \ No newline at end of file diff --git a/manifests/rbac/role_binding.yaml b/manifests/rbac/role_binding.yaml deleted file mode 100644 index b6f45bda94..0000000000 --- a/manifests/rbac/role_binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: training-operator-clusterrolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: training-operator-clusterrole -subjects: -- kind: ServiceAccount - name: training-operator-service-account - namespace: system diff --git a/manifests/rbac/service_account.yaml b/manifests/rbac/service_account.yaml deleted file mode 100644 index 1e7e7170ad..0000000000 --- a/manifests/rbac/service_account.yaml +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: training-operator-service-account - 
namespace: system diff --git a/manifests/rbac/tfjob_editor_role.yaml b/manifests/rbac/tfjob_editor_role.yaml deleted file mode 100644 index 3a9d0e5797..0000000000 --- a/manifests/rbac/tfjob_editor_role.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# permissions for end users to edit tfjobs. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: tfjob-editor-role -rules: -- apiGroups: - - kubeflow.org - resources: - - tfjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kubeflow.org - resources: - - tfjobs/status - verbs: - - get diff --git a/manifests/rbac/tfjob_viewer_role.yaml b/manifests/rbac/tfjob_viewer_role.yaml deleted file mode 100644 index 2809b2d510..0000000000 --- a/manifests/rbac/tfjob_viewer_role.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# permissions for end users to view tfjobs. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: tfjob-viewer-role -rules: -- apiGroups: - - kubeflow.org - resources: - - tfjobs - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - tfjobs/status - verbs: - - get diff --git a/manifests/rbac/xgboostjob_editor_role.yaml b/manifests/rbac/xgboostjob_editor_role.yaml deleted file mode 100644 index 2828f28ccb..0000000000 --- a/manifests/rbac/xgboostjob_editor_role.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# permissions for end users to edit xgboostjobs. 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: xgboostjob-editor-role -rules: -- apiGroups: - - kubeflow.org - resources: - - xgboostjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kubeflow.org - resources: - - xgboostjobs/status - verbs: - - get diff --git a/manifests/rbac/xgboostjob_viewer_role.yaml b/manifests/rbac/xgboostjob_viewer_role.yaml deleted file mode 100644 index 9380087368..0000000000 --- a/manifests/rbac/xgboostjob_viewer_role.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# permissions for end users to view xgboostjobs. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: xgboostjob-viewer-role -rules: -- apiGroups: - - kubeflow.org - resources: - - xgboostjobs - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - xgboostjobs/status - verbs: - - get