From 36e4f8d2b3caec6d3129ce4eb5134ac7789d7c15 Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Thu, 23 Jul 2020 12:39:15 -0700 Subject: [PATCH 01/11] handle changes to assets --- examples/k8s/Makefile | 6 +++--- examples/k8s/asset/asset/asset.json | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/k8s/Makefile b/examples/k8s/Makefile index c02b5f95349..d04a2614219 100644 --- a/examples/k8s/Makefile +++ b/examples/k8s/Makefile @@ -178,8 +178,8 @@ rebuild: destroy build setup ## destroys then re-runs things make register register: ## creates asset and registers job - earl assets deploy ${TERASLICE_ALIAS} terascope/elasticsearch-assets - earl assets deploy ${TERASLICE_ALIAS} --build --replace --src-dir asset/ + earl assets deploy ${TERASLICE_ALIAS} --blocking terascope/elasticsearch-assets + earl assets deploy ${TERASLICE_ALIAS} --blocking --build --replace --src-dir asset/ earl tjm register ${TERASLICE_ALIAS} example-job.json earl tjm register ${TERASLICE_ALIAS} example-job-labels.json earl tjm register ${TERASLICE_ALIAS} example-job-resource.json @@ -195,7 +195,7 @@ deregister: ## resets jobs start: ## starts example job # yes | tjm asset --replace -c $(TERASLICE_MASTER_URL) || echo '* it is okay' - earl assets deploy ${TERASLICE_ALIAS} --build --replace --src-dir asset/ + earl assets deploy ${TERASLICE_ALIAS} --blocking --build --replace --src-dir asset/ earl tjm start example-job.json stop: ## stops example job diff --git a/examples/k8s/asset/asset/asset.json b/examples/k8s/asset/asset/asset.json index d563e32ef45..b2c6845567b 100644 --- a/examples/k8s/asset/asset/asset.json +++ b/examples/k8s/asset/asset/asset.json @@ -1,4 +1,7 @@ { "name": "example", - "version": "1.0.0" + "version": "1.0.0", + "node_version": false, + "platform": false, + "arch": false } From ef3e918d5beb1671ae60cd7595c8e4ec3f7b4273 Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Thu, 23 Jul 2020 13:00:49 -0700 Subject: [PATCH 02/11] add waitForNumPods --- .../cluster/backends/kubernetes/k8s.js | 71 +++++++++++++++++-- 1 file changed, 67 insertions(+), 4 deletions(-) diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js index b21882183b8..9aa51a567ba 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js @@ -64,11 +64,14 @@ class K8s { * Rerturns the first pod matching the provided selector after it has * entered the `Running` state. * + * TODO: Make more generic to search for different statuses + * * NOTE: If your selector will return multiple pods, this method probably * won't work for you. * @param {String} selector kubernetes selector, like 'controller-uid=XXX' * @param {String} ns namespace to search, this will override the default * @param {Number} timeout time, in ms, to wait for pod to start + * @return {Object} pod */ async waitForSelectedPod(selector, ns, timeout = 10000) { const namespace = ns || this.defaultNamespace; @@ -97,6 +100,41 @@ class K8s { } } + /** + * Waits for the number of pods to equal number. 
+ * @param {Number} number Number of pods to wait for, e.g.: 0, 10 + * @param {String} selector kubernetes selector, like 'controller-uid=XXX' + * @param {String} ns namespace to search, this will override the default + * @param {Number} timeout time, in ms, to wait for pod to start + * @return {Array} Array of pod objects + */ + async waitForNumPods(number, selector, ns, timeout = 10000) { + const namespace = ns || this.defaultNamespace; + let now = Date.now(); + const end = now + timeout; + + // eslint-disable-next-line no-constant-condition + while (true) { + const result = await this.client.api.v1.namespaces(namespace) + .pods().get({ qs: { labelSelector: selector } }); + + let podList; + if (typeof result !== 'undefined' && result) { + podList = get(result, 'body.items'); + } + + if (typeof podList !== 'undefined' && podList) { + if (podList.length === number) return podList; + } + const msg = `Waiting: pods matching ${selector} is ${podList.length}/${number}`; + if (now > end) throw new Error(`Timeout ${msg}`); + this.logger.debug(msg); + + await pDelay(this.apiPollDelay); + now = Date.now(); + } + } + /** * returns list of k8s objects matching provided selector * @param {String} selector kubernetes selector, like 'app=teraslice' @@ -265,14 +303,39 @@ class K8s { * @return {Promise} */ async deleteExecution(exId) { + const r = []; if (!exId) { throw new Error('deleteExecution requires an executionId'); } - return Promise.all([ - this._deleteObjByExId(exId, 'worker', 'deployments'), - this._deleteObjByExId(exId, 'execution_controller', 'jobs'), - ]); + try { + this.logger.info(`Deleting worker deployment for ex_id: ${exId}`); + r.push(await this._deleteObjByExId(exId, 'worker', 'deployments')); + } catch (e) { + const err = new Error(`Error deleting workers: ${e}`); + this.logger.error(err); + return Promise.reject(err); + } + + const podList = await this.waitForNumPods( + 0, + `app.kubernetes.io/component=worker,teraslice.terascope.io/exId=${exId}`, + null, + 600000 // 10 minutes, I wanted this to be longer than the pod shutdown timeout + ); + + this.logger.info(`podLists: ${podList}`); + + try { + this.logger.info(`Deleting execution controller job for ex_id: ${exId}`); + r.push(await this._deleteObjByExId(exId, 'execution_controller', 'jobs')); + } catch (e) { + const err = new Error(`Error deleting execution controller: ${e}`); + this.logger.error(err); + return Promise.reject(err); + } + + return r; } /** From b7a5ee28a07ecf29abd5bab67bc95d78e00dc779 Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Mon, 27 Jul 2020 16:48:37 -0700 Subject: [PATCH 03/11] Update job with delay and update logging I also remove the test for the k8s implementation of deleteExecution. Mocking that would be pretty messy. 
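For reference, the shutdown path this job exercises reduces to the following use of the helper added in the previous patch (a sketch only: `k8s` stands for an instance of the K8s class, and the exId value is a placeholder):

    // Inside an async method: wait until every worker pod for this
    // execution has exited before touching the execution controller.
    const exId = 'example-ex-id';
    const selector = `app.kubernetes.io/component=worker,teraslice.terascope.io/exId=${exId}`;

    // waitForNumPods polls the pod list every `apiPollDelay` ms and
    // resolves with the matching pods once the count equals the target
    // (0 here), or throws once the 10 minute timeout is exceeded.
    const podList = await k8s.waitForNumPods(0, selector, null, 600000);

The 30 second delay operator added to the job keeps slices in flight long enough for that wait to actually matter.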
--- examples/k8s/example-job.json | 4 ++++ .../cluster/backends/kubernetes/k8s.js | 11 ++++++++--- .../cluster/backends/kubernetes/k8s-spec.js | 19 ------------------- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/examples/k8s/example-job.json b/examples/k8s/example-job.json index deb1d9ff902..349636dc0d8 100644 --- a/examples/k8s/example-job.json +++ b/examples/k8s/example-job.json @@ -14,6 +14,10 @@ { "_op": "example-op" }, + { + "_op": "delay", + "ms": 30000 + }, { "_op": "elasticsearch_index_selector", "index": "terak8s-example-data", diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js index 9aa51a567ba..49f5c4bcb3a 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js @@ -299,6 +299,12 @@ class K8s { /** * Delete all of the deployments and services related to the specified exId + * + * The process here waits for the worker pods to completely exit before + * terminating the execution controller pod. The intent is to avoid having + * a worker timeout when it tries to tell the execution controller it is + * exiting. + * * @param {String} exId ID of the execution * @return {Promise} */ @@ -317,15 +323,13 @@ class K8s { return Promise.reject(err); } - const podList = await this.waitForNumPods( + await this.waitForNumPods( 0, `app.kubernetes.io/component=worker,teraslice.terascope.io/exId=${exId}`, null, 600000 // 10 minutes, I wanted this to be longer than the pod shutdown timeout ); - this.logger.info(`podLists: ${podList}`); - try { this.logger.info(`Deleting execution controller job for ex_id: ${exId}`); r.push(await this._deleteObjByExId(exId, 'execution_controller', 'jobs')); @@ -335,6 +339,7 @@ class K8s { return Promise.reject(err); } + this.logger.debug(`Deleted Resources:\n${r.map((x) => JSON.stringify(x, null, 2))}`); return r; } diff --git a/packages/teraslice/test/lib/cluster/services/cluster/backends/kubernetes/k8s-spec.js b/packages/teraslice/test/lib/cluster/services/cluster/backends/kubernetes/k8s-spec.js index dcac3569645..57ab757a06d 100644 --- a/packages/teraslice/test/lib/cluster/services/cluster/backends/kubernetes/k8s-spec.js +++ b/packages/teraslice/test/lib/cluster/services/cluster/backends/kubernetes/k8s-spec.js @@ -223,23 +223,4 @@ describe('k8s', () => { expect(response.spec.replicas).toEqual(3); }); }); - - describe('->deleteExecution', () => { - it('can delete an execution', async () => { - nock(_url) - .get('/apis/apps/v1/namespaces/default/deployments/') - .query({ labelSelector: /app\.kubernetes\.io\/component=worker,teraslice\.terascope\.io\/exId=.*/ }) - .reply(200, { kind: 'DeploymentList', items: [{ metadata: { name: 'e33b5454' } }] }) - .get('/apis/batch/v1/namespaces/default/jobs/') - .query({ labelSelector: /app\.kubernetes\.io\/component=execution_controller,teraslice\.terascope\.io\/exId=.*/ }) - .reply(200, { kind: 'JobList', items: [{ metadata: { name: 'e33b5454' } }] }) - .delete('/apis/apps/v1/namespaces/default/deployments/e33b5454') - .reply(200, {}) - .delete('/apis/batch/v1/namespaces/default/jobs/e33b5454') - .reply(200, {}); - - const response = await k8s.deleteExecution('e33b5454'); - expect(response).toEqual([{}, {}]); - }); - }); }); From 1a23d2f76e04d32302cbc8a28a57a71b98033c3a Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Mon, 27 Jul 2020 17:55:55 -0700 Subject: [PATCH 04/11] Minor 
logging change. --- .../lib/cluster/services/cluster/backends/kubernetes/k8s.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js index 49f5c4bcb3a..0f1f6e2df35 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js @@ -339,7 +339,7 @@ class K8s { return Promise.reject(err); } - this.logger.debug(`Deleted Resources:\n${r.map((x) => JSON.stringify(x, null, 2))}`); + this.logger.debug(`Deleted Resources:\n\n${r.map((x) => JSON.stringify(x, null, 2))}`); return r; } From 43a7b182827c7038dd3a5168641ff5750f223702 Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Tue, 28 Jul 2020 10:33:13 -0700 Subject: [PATCH 05/11] release: (minor) teraslice@0.69.0 --- package.json | 2 +- packages/teraslice/package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index c188725a7e4..07f5df50b2e 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "teraslice-workspace", "displayName": "Teraslice", - "version": "0.68.1", + "version": "0.69.0", "private": true, "homepage": "https://github.com/terascope/teraslice", "bugs": { diff --git a/packages/teraslice/package.json b/packages/teraslice/package.json index 8e09fffe7eb..4173622bfa8 100644 --- a/packages/teraslice/package.json +++ b/packages/teraslice/package.json @@ -1,7 +1,7 @@ { "name": "teraslice", "displayName": "Teraslice", - "version": "0.68.1", + "version": "0.69.0", "description": "Distributed computing platform for processing JSON data", "homepage": "https://github.com/terascope/teraslice#readme", "bugs": { From 869ff31081a51dbca41882aa479181bd20c59beb Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Thu, 30 Jul 2020 15:51:15 -0700 Subject: [PATCH 06/11] add pRetry and worker shutdown polling change I've added pRetry everywhere but POST. Which should be safe. I now drive the worker pod polling with the default shutdown timeout. 
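The retry wrapper pattern applied throughout k8s.js looks like this (a sketch; `client`, `namespace` and `selector` stand in for the values in scope there):

    const { pRetry } = require('@terascope/utils');
    const { getRetryConfig } = require('./utils');

    // GETs are idempotent, so retrying them is safe. POSTs are left
    // un-retried on purpose: replaying a create could duplicate resources.
    // getRetryConfig() returns { retries, delay }, with smaller values
    // under test.
    const result = await pRetry(() => client.api.v1.namespaces(namespace)
        .pods().get({ qs: { labelSelector: selector } }), getRetryConfig());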
--- .../cluster/backends/kubernetes/index.js | 9 ++- .../cluster/backends/kubernetes/k8s.js | 77 +++++++++++-------- .../cluster/backends/kubernetes/utils.js | 19 ++++- 3 files changed, 68 insertions(+), 37 deletions(-) diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/index.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/index.js index b4f10e2cf23..f4959650eb1 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/index.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/index.js @@ -27,8 +27,13 @@ module.exports = function kubernetesClusterBackend(context, clusterMasterServer) const clusterState = {}; let clusterStateInterval = null; - const k8s = new K8s(logger, null, kubernetesNamespace, - context.sysconfig.teraslice.kubernetes_api_poll_delay); + const k8s = new K8s( + logger, + null, + kubernetesNamespace, + context.sysconfig.teraslice.kubernetes_api_poll_delay, + context.sysconfig.teraslice.shutdown_timeout + ); clusterMasterServer.onClientOnline((exId) => { logger.info(`execution ${exId} is connected`); diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js index 0f1f6e2df35..bbd2c02fef2 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js @@ -1,16 +1,19 @@ 'use strict'; const { - TSError, get, isEmpty, pDelay + TSError, get, isEmpty, pDelay, pRetry } = require('@terascope/utils'); const { Client, KubeConfig } = require('kubernetes-client'); const Request = require('kubernetes-client/backends/request'); +const { getRetryConfig } = require('./utils'); class K8s { - constructor(logger, clientConfig, defaultNamespace = 'default', apiPollDelay) { + constructor(logger, clientConfig, defaultNamespace = 'default', + apiPollDelay, shutdownTimeout) { this.apiPollDelay = apiPollDelay; - this.logger = logger; this.defaultNamespace = defaultNamespace; + this.logger = logger; + this.shutdownTimeout = shutdownTimeout; // this is in milliseconds if (clientConfig) { this.client = new Client({ @@ -50,7 +53,8 @@ class K8s { async getNamespaces() { let namespaces; try { - namespaces = await this.client.api.v1.namespaces.get(); + namespaces = await pRetry(() => this.client + .api.v1.namespaces.get(), getRetryConfig()); } catch (err) { const error = new TSError(err, { reason: 'Failure getting in namespaces' @@ -80,8 +84,9 @@ class K8s { // eslint-disable-next-line no-constant-condition while (true) { - const result = await this.client.api.v1.namespaces(namespace) - .pods().get({ qs: { labelSelector: selector } }); + const result = await pRetry(() => this.client + .api.v1.namespaces(namespace).pods() + .get({ qs: { labelSelector: selector } }), getRetryConfig()); let pod; if (typeof result !== 'undefined' && result) { @@ -115,8 +120,9 @@ class K8s { // eslint-disable-next-line no-constant-condition while (true) { - const result = await this.client.api.v1.namespaces(namespace) - .pods().get({ qs: { labelSelector: selector } }); + const result = await pRetry(() => this.client + .api.v1.namespaces(namespace).pods() + .get({ qs: { labelSelector: selector } }), getRetryConfig()); let podList; if (typeof result !== 'undefined' && result) { @@ -149,17 +155,21 @@ class K8s { try { if (objType === 'pods') { - response = await this.client.api.v1.namespaces(namespace) - .pods().get({ qs: 
{ labelSelector: selector } }); + response = await pRetry(() => this.client + .api.v1.namespaces(namespace).pods() + .get({ qs: { labelSelector: selector } }), getRetryConfig()); } else if (objType === 'deployments') { - response = await this.client.apis.apps.v1.namespaces(namespace) - .deployments().get({ qs: { labelSelector: selector } }); + response = await pRetry(() => this.client + .apis.apps.v1.namespaces(namespace).deployments() + .get({ qs: { labelSelector: selector } }), getRetryConfig()); } else if (objType === 'services') { - response = await this.client.api.v1.namespaces(namespace) - .services().get({ qs: { labelSelector: selector } }); + response = await pRetry(() => this.client + .api.v1.namespaces(namespace).services() + .get({ qs: { labelSelector: selector } }), getRetryConfig()); } else if (objType === 'jobs') { - response = await this.client.apis.batch.v1.namespaces(namespace) - .jobs().get({ qs: { labelSelector: selector } }); + response = await pRetry(() => this.client + .apis.batch.v1.namespaces(namespace).jobs() + .get({ qs: { labelSelector: selector } }), getRetryConfig()); } else { const error = new Error(`Wrong objType provided to get: ${objType}`); this.logger.error(error); @@ -232,8 +242,9 @@ class K8s { let response; try { - response = await this.client.apis.apps.v1.namespaces(this.defaultNamespace) - .deployments(name).patch({ body: record }); + response = await pRetry(() => this.client + .apis.apps.v1.namespaces(this.defaultNamespace).deployments(name) + .patch({ body: record }), getRetryConfig()); } catch (e) { const err = new Error(`Request k8s.patch with ${name} failed with: ${e}`); this.logger.error(err); @@ -262,22 +273,25 @@ class K8s { try { if (objType === 'services') { - response = await this.client.api.v1.namespaces(this.defaultNamespace) - .services(name).delete(); + response = await pRetry(() => this.client + .api.v1.namespaces(this.defaultNamespace).services(name) + .delete(), getRetryConfig(), getRetryConfig()); } else if (objType === 'deployments') { - response = await this.client.apis.apps.v1.namespaces(this.defaultNamespace) - .deployments(name).delete(); + response = await pRetry(() => this.client + .apis.apps.v1.namespaces(this.defaultNamespace).deployments(name) + .delete(), getRetryConfig()); } else if (objType === 'jobs') { // To get a Job to remove the associated pods you have to // include a body like the one below with the delete request - response = await this.client.apis.batch.v1.namespaces(this.defaultNamespace) - .jobs(name).delete({ + response = await pRetry(() => this.client + .apis.batch.v1.namespaces(this.defaultNamespace).jobs(name) + .delete({ body: { apiVersion: 'v1', kind: 'DeleteOptions', propagationPolicy: 'Background' } - }); + }), getRetryConfig()); } else { throw new Error(`Invalid objType: ${objType}`); } @@ -314,20 +328,15 @@ class K8s { throw new Error('deleteExecution requires an executionId'); } - try { - this.logger.info(`Deleting worker deployment for ex_id: ${exId}`); - r.push(await this._deleteObjByExId(exId, 'worker', 'deployments')); - } catch (e) { - const err = new Error(`Error deleting workers: ${e}`); - this.logger.error(err); - return Promise.reject(err); - } + // FIXME: think more about this + this.logger.info(`Deleting worker deployment for ex_id: ${exId}`); + r.push(await this._deleteObjByExId(exId, 'worker', 'deployments')); await this.waitForNumPods( 0, `app.kubernetes.io/component=worker,teraslice.terascope.io/exId=${exId}`, null, - 600000 // 10 minutes, I wanted this to be longer than the pod 
shutdown timeout + this.shutdownTimeout + 15000 // shutdown_timeout + 15s ); try { diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/utils.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/utils.js index 8ad4caf891b..62d4f6b79a9 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/utils.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/utils.js @@ -4,6 +4,8 @@ const fs = require('fs'); const path = require('path'); const barbe = require('barbe'); +const isTest = require('@terascope/utils'); + function makeTemplate(folder, fileName) { const filePath = path.join(__dirname, folder, `${fileName}.hbs`); const templateData = fs.readFileSync(filePath, 'utf-8'); @@ -38,4 +40,19 @@ function setMaxOldSpaceViaEnv(envArr, jobEnv, memory) { }); } -module.exports = { setMaxOldSpaceViaEnv, makeTemplate, getMaxOldSpace }; +const MAX_RETRIES = isTest ? 2 : 3; +const RETRY_DELAY = isTest ? 50 : 1000; // time in ms + +function getRetryConfig() { + return { + retries: MAX_RETRIES, + delay: RETRY_DELAY + }; +} + +module.exports = { + getMaxOldSpace, + getRetryConfig, + makeTemplate, + setMaxOldSpaceViaEnv +}; From 03ad3e93b15b888270cdf3863cd98c20b36c9a08 Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Thu, 30 Jul 2020 18:22:41 -0700 Subject: [PATCH 07/11] Improve error handling during stop I also link the worker deployment to the execution controller job. This allows k8s to garbage collect the worker deployment (and then pods) when the execution controller is deleted. This is good in general but also critical to the error handling in deleteExecution closes #1612 --- .../kubernetes/deployments/worker.hbs | 12 ++++++++- .../cluster/backends/kubernetes/index.js | 11 +++++++- .../cluster/backends/kubernetes/k8s.js | 26 ++++++++++++------- .../backends/kubernetes/k8sResource.js | 2 ++ 4 files changed, 39 insertions(+), 12 deletions(-) diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/deployments/worker.hbs b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/deployments/worker.hbs index efbcca0e6a2..72249f654d7 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/deployments/worker.hbs +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/deployments/worker.hbs @@ -11,7 +11,17 @@ "app.kubernetes.io/instance": "{{clusterNameLabel}}" }, "name": "{{name}}", - "namespace": "{{namespace}}" + "namespace": "{{namespace}}", + "ownerReferences": [ + { + "apiVersion": "batch/v1", + "controller": false, + "blockOwnerDeletion": false, + "kind": "Job", + "name": "{{exName}}", + "uid": "{{exUid}}" + } + ] }, "spec": { "replicas": {{replicas}}, diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/index.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/index.js index f4959650eb1..9fc51fee8ee 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/index.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/index.js @@ -121,7 +121,16 @@ module.exports = function kubernetesClusterBackend(context, clusterMasterServer) * @param {Object} execution Object that contains information of Execution * @return {Promise} [description] */ - function allocateWorkers(execution) { + async function allocateWorkers(execution) { + // NOTE: I tried to set these on the execution inside allocateSlicer + // but these properties were gone by the time this 
was called, perhaps + // because they are not on the schema. So I do this k8s API call + // instead. + const selector = `app.kubernetes.io/component=execution_controller,teraslice.terascope.io/jobId=${execution.job_id}`; + const jobs = await k8s.list(selector, 'jobs'); + execution.k8sName = jobs.items[0].metadata.name; + execution.k8sUid = jobs.items[0].metadata.uid; + const kr = new K8sResource( 'deployments', 'worker', context.sysconfig.teraslice, execution ); diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js index bbd2c02fef2..469871298a3 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8s.js @@ -328,16 +328,22 @@ class K8s { throw new Error('deleteExecution requires an executionId'); } - // FIXME: think more about this - this.logger.info(`Deleting worker deployment for ex_id: ${exId}`); - r.push(await this._deleteObjByExId(exId, 'worker', 'deployments')); - - await this.waitForNumPods( - 0, - `app.kubernetes.io/component=worker,teraslice.terascope.io/exId=${exId}`, - null, - this.shutdownTimeout + 15000 // shutdown_timeout + 15s - ); + try { + this.logger.info(`Deleting worker deployment for ex_id: ${exId}`); + r.push(await this._deleteObjByExId(exId, 'worker', 'deployments')); + + await this.waitForNumPods( + 0, + `app.kubernetes.io/component=worker,teraslice.terascope.io/exId=${exId}`, + null, + this.shutdownTimeout + 15000 // shutdown_timeout + 15s + ); + } catch (e) { + // deliberately ignore errors, k8s will clean up workers when + // execution controller gets deleted. + const err = new Error(`Error encountered deleting pod deployment, continuing execution controller shutdown: ${e}`); + this.logger.error(err); + } try { this.logger.info(`Deleting execution controller job for ex_id: ${exId}`); diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js index 17854c4b910..1a65bc57b09 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/k8sResource.js @@ -96,6 +96,8 @@ class K8sResource { dockerImage, execution: safeEncode(this.execution), exId: this.execution.ex_id, + exName: this.execution.k8sName, + exUid: this.execution.k8sUid, jobId: this.execution.job_id, jobNameLabel, name, From 486b326c71d5e1a11bd2cb317193c2fe62f9bff9 Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Fri, 31 Jul 2020 11:36:26 -0700 Subject: [PATCH 08/11] fix isTest --- .../lib/cluster/services/cluster/backends/kubernetes/utils.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/utils.js b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/utils.js index 62d4f6b79a9..9f7e53960fd 100644 --- a/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/utils.js +++ b/packages/teraslice/lib/cluster/services/cluster/backends/kubernetes/utils.js @@ -4,7 +4,7 @@ const fs = require('fs'); const path = require('path'); const barbe = require('barbe'); -const isTest = require('@terascope/utils'); +const { isTest } = require('@terascope/utils'); function makeTemplate(folder, fileName) { const filePath = path.join(__dirname, folder, `${fileName}.hbs`); 
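That one-character change matters more than it looks: require() returns the whole module object, which is always truthy, so the un-destructured form silently forced the test-mode branch (assuming isTest is a boolean exported by @terascope/utils):

    // Before: `isTest` was bound to the module object itself, which is
    // truthy, so these always took the test values (2 retries, 50ms).
    // After: `isTest` is the exported boolean, and non-test environments
    // get 3 retries with a 1000ms delay as intended.
    const MAX_RETRIES = isTest ? 2 : 3;
    const RETRY_DELAY = isTest ? 50 : 1000; // time in ms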
From d87eb6ae2c91b89834b07d339985a68eed2ece52 Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Fri, 31 Jul 2020 12:33:01 -0700 Subject: [PATCH 09/11] fix util spec --- .../cluster/backends/kubernetes/utils-spec.js | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/packages/teraslice/test/lib/cluster/services/cluster/backends/kubernetes/utils-spec.js b/packages/teraslice/test/lib/cluster/services/cluster/backends/kubernetes/utils-spec.js index 7ebc60494bf..dc5e898dec2 100644 --- a/packages/teraslice/test/lib/cluster/services/cluster/backends/kubernetes/utils-spec.js +++ b/packages/teraslice/test/lib/cluster/services/cluster/backends/kubernetes/utils-spec.js @@ -72,6 +72,8 @@ describe('K8s Utils', () => { jobNameLabel: 'example-job', clusterNameLabel: 'example-cluster', exId: 'some-ex-id', + exName: 'example-job-abcd', + exUid: 'UID1', jobId: 'some-job-id', nodeType: 'worker', namespace: 'some-namespace', @@ -93,7 +95,17 @@ describe('K8s Utils', () => { 'app.kubernetes.io/instance': config.clusterNameLabel }, name: config.name, - namespace: config.namespace + namespace: config.namespace, + ownerReferences: [ + { + apiVersion: 'batch/v1', + blockOwnerDeletion: false, + controller: false, + kind: 'Job', + name: 'example-job-abcd', + uid: 'UID1', + }, + ], }); expect(workerDeployment.spec.replicas).toEqual(config.replicas); From 3f15741e6ff660575323c8cda0ff34518d23928b Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Fri, 31 Jul 2020 12:34:03 -0700 Subject: [PATCH 10/11] release: (patch) teraslice@0.69.1 --- examples/k8s/example-job-labels.json | 10 +++++++++- examples/k8s/example-job-resource.json | 10 +++++++++- examples/k8s/example-job-targets.json | 10 +++++++++- examples/k8s/example-job-volume.json | 10 +++++++++- examples/k8s/example-job.json | 10 +++++++++- package.json | 2 +- packages/teraslice/package.json | 2 +- 7 files changed, 47 insertions(+), 7 deletions(-) diff --git a/examples/k8s/example-job-labels.json b/examples/k8s/example-job-labels.json index 0c9e6dd9d56..caa698d364e 100644 --- a/examples/k8s/example-job-labels.json +++ b/examples/k8s/example-job-labels.json @@ -26,5 +26,13 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ] + ], + "__metadata": { + "cli": { + "cluster": "http://192.168.39.185:30678", + "version": "0.28.2", + "job_id": "e5d44c51-c6e9-4079-9548-647ace7f5045", + "updated": "2020-07-31T18:18:00.156Z" + } + } } diff --git a/examples/k8s/example-job-resource.json b/examples/k8s/example-job-resource.json index 5ac464bbda1..15b01d388f0 100644 --- a/examples/k8s/example-job-resource.json +++ b/examples/k8s/example-job-resource.json @@ -25,5 +25,13 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ] + ], + "__metadata": { + "cli": { + "cluster": "http://192.168.39.185:30678", + "version": "0.28.2", + "job_id": "9ab6e845-7465-44a9-b4a2-7e29e00789f6", + "updated": "2020-07-31T18:18:01.362Z" + } + } } diff --git a/examples/k8s/example-job-targets.json b/examples/k8s/example-job-targets.json index 57f0098c03b..33f61d8da26 100644 --- a/examples/k8s/example-job-targets.json +++ b/examples/k8s/example-job-targets.json @@ -29,5 +29,13 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ] + ], + "__metadata": { + "cli": { + "cluster": "http://192.168.39.185:30678", + "version": "0.28.2", + "job_id": "52fc2d6f-26c2-4953-9a17-9293ea7a2476", + "updated": "2020-07-31T18:18:02.609Z" + } + } } diff --git a/examples/k8s/example-job-volume.json b/examples/k8s/example-job-volume.json index 7c2a5a08e26..f7cdc6ae77c 100644 --- 
a/examples/k8s/example-job-volume.json +++ b/examples/k8s/example-job-volume.json @@ -29,5 +29,13 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ] + ], + "__metadata": { + "cli": { + "cluster": "http://192.168.39.185:30678", + "version": "0.28.2", + "job_id": "28e59db6-ccf2-4521-8e5d-3a66d3abe49b", + "updated": "2020-07-31T18:18:03.835Z" + } + } } diff --git a/examples/k8s/example-job.json b/examples/k8s/example-job.json index 349636dc0d8..dbecce7ddb1 100644 --- a/examples/k8s/example-job.json +++ b/examples/k8s/example-job.json @@ -27,5 +27,13 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ] + ], + "__metadata": { + "cli": { + "cluster": "http://192.168.39.185:30678", + "version": "0.28.2", + "job_id": "e5f49c7d-05b3-43b0-96e0-4802c77b1099", + "updated": "2020-07-31T18:17:58.957Z" + } + } } diff --git a/package.json b/package.json index e95c39ea82c..9bd2478cf37 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "teraslice-workspace", "displayName": "Teraslice", - "version": "0.69.0", + "version": "0.69.1", "private": true, "homepage": "https://github.com/terascope/teraslice", "bugs": { diff --git a/packages/teraslice/package.json b/packages/teraslice/package.json index 5f487458ea3..8399b0c2d2c 100644 --- a/packages/teraslice/package.json +++ b/packages/teraslice/package.json @@ -1,7 +1,7 @@ { "name": "teraslice", "displayName": "Teraslice", - "version": "0.69.0", + "version": "0.69.1", "description": "Distributed computing platform for processing JSON data", "homepage": "https://github.com/terascope/teraslice#readme", "bugs": { From fa4fa66af6553194f5ad1ba0cc4813b127c33984 Mon Sep 17 00:00:00 2001 From: Austin Godber Date: Fri, 31 Jul 2020 12:37:28 -0700 Subject: [PATCH 11/11] reverting accidental job commit --- examples/k8s/example-job-labels.json | 10 +--------- examples/k8s/example-job-resource.json | 10 +--------- examples/k8s/example-job-targets.json | 10 +--------- examples/k8s/example-job-volume.json | 10 +--------- examples/k8s/example-job.json | 10 +--------- 5 files changed, 5 insertions(+), 45 deletions(-) diff --git a/examples/k8s/example-job-labels.json b/examples/k8s/example-job-labels.json index caa698d364e..0c9e6dd9d56 100644 --- a/examples/k8s/example-job-labels.json +++ b/examples/k8s/example-job-labels.json @@ -26,13 +26,5 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ], - "__metadata": { - "cli": { - "cluster": "http://192.168.39.185:30678", - "version": "0.28.2", - "job_id": "e5d44c51-c6e9-4079-9548-647ace7f5045", - "updated": "2020-07-31T18:18:00.156Z" - } - } + ] } diff --git a/examples/k8s/example-job-resource.json b/examples/k8s/example-job-resource.json index 15b01d388f0..5ac464bbda1 100644 --- a/examples/k8s/example-job-resource.json +++ b/examples/k8s/example-job-resource.json @@ -25,13 +25,5 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ], - "__metadata": { - "cli": { - "cluster": "http://192.168.39.185:30678", - "version": "0.28.2", - "job_id": "9ab6e845-7465-44a9-b4a2-7e29e00789f6", - "updated": "2020-07-31T18:18:01.362Z" - } - } + ] } diff --git a/examples/k8s/example-job-targets.json b/examples/k8s/example-job-targets.json index 33f61d8da26..57f0098c03b 100644 --- a/examples/k8s/example-job-targets.json +++ b/examples/k8s/example-job-targets.json @@ -29,13 +29,5 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ], - "__metadata": { - "cli": { - "cluster": "http://192.168.39.185:30678", - "version": "0.28.2", - "job_id": "52fc2d6f-26c2-4953-9a17-9293ea7a2476", - "updated": "2020-07-31T18:18:02.609Z" - } - } + ] } 
diff --git a/examples/k8s/example-job-volume.json b/examples/k8s/example-job-volume.json index f7cdc6ae77c..7c2a5a08e26 100644 --- a/examples/k8s/example-job-volume.json +++ b/examples/k8s/example-job-volume.json @@ -29,13 +29,5 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ], - "__metadata": { - "cli": { - "cluster": "http://192.168.39.185:30678", - "version": "0.28.2", - "job_id": "28e59db6-ccf2-4521-8e5d-3a66d3abe49b", - "updated": "2020-07-31T18:18:03.835Z" - } - } + ] } diff --git a/examples/k8s/example-job.json b/examples/k8s/example-job.json index dbecce7ddb1..349636dc0d8 100644 --- a/examples/k8s/example-job.json +++ b/examples/k8s/example-job.json @@ -27,13 +27,5 @@ "_op": "elasticsearch_bulk", "size": 5000 } - ], - "__metadata": { - "cli": { - "cluster": "http://192.168.39.185:30678", - "version": "0.28.2", - "job_id": "e5f49c7d-05b3-43b0-96e0-4802c77b1099", - "updated": "2020-07-31T18:17:58.957Z" - } - } + ] }
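Stepping back to patch 07: the ownerReferences entry is what makes the "deliberately ignore errors" branch in deleteExecution safe. Once the worker Deployment names the execution controller Job as its owner, deleting the Job lets the Kubernetes garbage collector remove the Deployment and its pods even if the explicit worker delete failed. A sketch of the linkage rendered into worker.hbs (the name and uid values are illustrative, taken from the updated spec):

    // allocateWorkers looks these up via the k8s API because they are
    // not on the execution schema:
    const execution = { k8sName: 'example-job-abcd', k8sUid: 'UID1' };

    // Owner reference tying the worker Deployment to the execution
    // controller Job:
    const ownerReference = {
        apiVersion: 'batch/v1',
        kind: 'Job',
        name: execution.k8sName,
        uid: execution.k8sUid,
        controller: false,        // the Job does not actively manage the Deployment
        blockOwnerDeletion: false // Job deletion need not block on the Deployment
    };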