From 99ad1f2e15a48a9c92cb7d4bd45f1e903f6096be Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Mon, 29 Apr 2019 20:22:16 -0400 Subject: [PATCH 01/17] [JENKINS-49707] If a KubernetesComputer disconnects, remove the KubernetesSlave so that DurableTaskStep knows to clean up. --- pom.xml | 13 +++++++------ .../plugins/kubernetes/KubernetesLauncher.java | 18 ++++++++++++++++++ .../pipeline/KubernetesPipelineTest.java | 11 +++++++++++ .../kubernetes/pipeline/terminatedPod.groovy | 9 +++++++++ 4 files changed, 45 insertions(+), 6 deletions(-) create mode 100644 src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy diff --git a/pom.xml b/pom.xml index b1b66ccc45..5bb14714cd 100644 --- a/pom.xml +++ b/pom.xml @@ -45,10 +45,11 @@ <connectorHost /> <jenkins.host.address /> <java.level>8</java.level> - <jenkins.version>2.138.4</jenkins.version> + <jenkins.version>2.150.1</jenkins.version> <no-test-jar>false</no-test-jar> <surefire.rerunFailingTestsCount>1</surefire.rerunFailingTestsCount> <pipeline-model-definition.version>1.3.7</pipeline-model-definition.version> + <workflow-support-plugin.version>3.1</workflow-support-plugin.version> </properties> <dependencies> @@ -94,7 +95,7 @@ <dependency> <!-- OnceRetentionStrategy --> <groupId>org.jenkins-ci.plugins</groupId> <artifactId>durable-task</artifactId> - <version>1.28</version> + <version>1.29</version> </dependency> <dependency> <groupId>org.jenkins-ci.plugins</groupId> @@ -104,7 +105,7 @@ <dependency> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-step-api</artifactId> - <version>2.17</version> + <version>2.18</version> </dependency> <dependency> <!-- DeclarativeAgent --> <groupId>org.jenkinsci.plugins</groupId> @@ -136,19 +137,19 @@ <dependency> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-support</artifactId> - <version>3.0</version> + <version>${workflow-support-plugin.version}</version> <scope>test</scope> </dependency> <dependency> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-durable-task-step</artifactId> - <version>2.27</version> + <version>2.31-SNAPSHOT</version> <!-- https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/104 --> <scope>test</scope> </dependency> <dependency> <!-- SemaphoreStep --> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-support</artifactId> - <version>3.0</version> + <version>${workflow-support-plugin.version}</version> <classifier>tests</classifier> <scope>test</scope> </dependency> diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java index 9e0e7f6936..1365c4514a 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java @@ -39,6 +39,7 @@ import org.kohsuke.stapler.DataBoundConstructor; import com.google.common.base.Throwables; +import hudson.model.Slave; import hudson.model.TaskListener; import hudson.slaves.JNLPLauncher; @@ -46,6 +47,7 @@ import io.fabric8.kubernetes.api.model.Pod; import io.fabric8.kubernetes.client.KubernetesClient; +import jenkins.model.Jenkins; /** * Launches on Kubernetes the specified {@link KubernetesComputer} instance. @@ -159,4 +161,20 @@ public AllContainersRunningPodWatcher getWatcher() { return watcher; } + @Override + public void afterDisconnect(SlaveComputer computer, TaskListener listener) { + // TODO can this be done only if the pod is dead? + Slave node = computer.getNode(); + if (node != null) { + try { + Jenkins.get().removeNode(node); + LOGGER.info(() -> "Removed disconnected agent " + node.getNodeName()); + } catch (IOException x) { + LOGGER.log(Level.WARNING, x, () -> "Unable to remove Jenkins node " + node.getNodeName()); + } + } else { + LOGGER.fine(() -> "Could not find disconnected agent " + computer.getName() + " to remove"); + } + } + } diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java index 3305465a92..3cb190b461 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java @@ -50,6 +50,7 @@ import org.jvnet.hudson.test.JenkinsRuleNonLocalhost; import hudson.model.Result; +import org.jvnet.hudson.test.Issue; /** * @author Carlos Sanchez @@ -334,4 +335,14 @@ public void runInPodWithRetention() throws Exception { r.assertBuildStatusSuccess(r.waitForCompletion(b)); assertTrue(deletePods(cloud.connect(), getLabels(this, name), true)); } + + @Issue("JENKINS-49707") + @Test + public void terminatedPod() throws Exception { + r.waitForMessage("+ sleep", b); + deletePods(cloud.connect(), getLabels(this, name), false); + r.assertBuildStatus(Result.FAILURE, r.waitForCompletion(b)); + r.assertLogContains(" was removed", b); + } + } diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy new file mode 100644 index 0000000000..6789d6127e --- /dev/null +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy @@ -0,0 +1,9 @@ +podTemplate(label: 'runInPod', containers: [ + containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'), + ]) { + node ('runInPod') { + container('busybox') { + sh 'sleep 9999999' + } + } +} From cae2572a63ad059c420d2f90965355b53fd7677c Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Mon, 29 Apr 2019 21:39:55 -0400 Subject: [PATCH 02/17] workflow-step-api-plugin.version --- pom.xml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 5bb14714cd..ca658eea31 100644 --- a/pom.xml +++ b/pom.xml @@ -50,6 +50,7 @@ <surefire.rerunFailingTestsCount>1</surefire.rerunFailingTestsCount> <pipeline-model-definition.version>1.3.7</pipeline-model-definition.version> <workflow-support-plugin.version>3.1</workflow-support-plugin.version> + <workflow-step-api-plugin.version>2.18</workflow-step-api-plugin.version> </properties> <dependencies> @@ -105,7 +106,7 @@ <dependency> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-step-api</artifactId> - <version>2.18</version> + <version>${workflow-step-api-plugin.version}</version> </dependency> <dependency> <!-- DeclarativeAgent --> <groupId>org.jenkinsci.plugins</groupId> @@ -130,7 +131,7 @@ <dependency> <!-- StepConfigTester --> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-step-api</artifactId> - <version>2.17</version> + <version>${workflow-step-api-plugin.version}</version> <classifier>tests</classifier> <scope>test</scope> </dependency> From eed014f57d1f84b9cc7297614483c0b347c255c2 Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Mon, 29 Apr 2019 21:41:35 -0400 Subject: [PATCH 03/17] Updated to https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/104/commits/58b127fa34cd10bfc76659b86292832965fe7c95. --- .../plugins/kubernetes/pipeline/KubernetesPipelineTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java index 3cb190b461..1255dcfa4d 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java @@ -50,6 +50,7 @@ import org.jvnet.hudson.test.JenkinsRuleNonLocalhost; import hudson.model.Result; +import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution; import org.jvnet.hudson.test.Issue; /** @@ -341,8 +342,8 @@ public void runInPodWithRetention() throws Exception { public void terminatedPod() throws Exception { r.waitForMessage("+ sleep", b); deletePods(cloud.connect(), getLabels(this, name), false); - r.assertBuildStatus(Result.FAILURE, r.waitForCompletion(b)); - r.assertLogContains(" was removed", b); + r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); + r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b); } } From c3c98a1ee17ad3a10d42bd939a07127c59767e8c Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Tue, 30 Apr 2019 17:25:09 -0400 Subject: [PATCH 04/17] Incremental deployment. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ca658eea31..e8e1fd4b9f 100644 --- a/pom.xml +++ b/pom.xml @@ -144,7 +144,7 @@ <dependency> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-durable-task-step</artifactId> - <version>2.31-SNAPSHOT</version> <!-- https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/104 --> + <version>2.31-rc898.994233111902</version> <!-- TODO https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/104 --> <scope>test</scope> </dependency> <dependency> <!-- SemaphoreStep --> From 4510310f594a2962c105a931de0c7893b97b3d02 Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Tue, 30 Apr 2019 17:41:19 -0400 Subject: [PATCH 05/17] JDK 11 Javadoc failure. --- pom.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e8e1fd4b9f..6a6d90ca2f 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,8 @@ <parent> <groupId>org.jenkins-ci.plugins</groupId> <artifactId>plugin</artifactId> - <version>3.42</version> + <version>3.43</version> + <relativePath /> </parent> <groupId>org.csanchez.jenkins.plugins</groupId> From 8569e835bce42d815e6c3fcc0c47531fdc203c0f Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Wed, 1 May 2019 09:06:58 -0400 Subject: [PATCH 06/17] =?UTF-8?q?Setting=20surefire.rerunFailingTestsCount?= =?UTF-8?q?=20to=200=20not=201.=20while=20this=20plugin=20had=20tried=20to?= =?UTF-8?q?=20turn=20off=20the=20rerun=20feature,=20what=20it=20actually?= =?UTF-8?q?=20did=20was=20make=20it=20so=20that=20failing=20tests=20are=20?= =?UTF-8?q?rerun=20once=E2=80=94i.e.,=20run=20twice=20before=20finally=20f?= =?UTF-8?q?ailing.=20This=20happens=20even=20when=20-Dtest=3D=E2=80=A6=20i?= =?UTF-8?q?s=20specified,=20which=20disables=20the=20profile=20in=20the=20?= =?UTF-8?q?parent=20POM,=20annoyingly=20making=20all=20failing=20local=20t?= =?UTF-8?q?ests=20run=20twice.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6a6d90ca2f..9dc52fff4f 100644 --- a/pom.xml +++ b/pom.xml @@ -48,7 +48,7 @@ <java.level>8</java.level> <jenkins.version>2.150.1</jenkins.version> <no-test-jar>false</no-test-jar> - <surefire.rerunFailingTestsCount>1</surefire.rerunFailingTestsCount> + <surefire.rerunFailingTestsCount>0</surefire.rerunFailingTestsCount> <pipeline-model-definition.version>1.3.7</pipeline-model-definition.version> <workflow-support-plugin.version>3.1</workflow-support-plugin.version> <workflow-step-api-plugin.version>2.18</workflow-step-api-plugin.version> From f11a18226d8dd95ab695f6a1ac38dd49e2aa59b3 Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Wed, 1 May 2019 09:08:36 -0400 Subject: [PATCH 07/17] Copy-pasta. --- .../jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy index 6789d6127e..8e54c40221 100644 --- a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy @@ -1,7 +1,7 @@ -podTemplate(label: 'runInPod', containers: [ +podTemplate(label: 'terminatedPod', containers: [ containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'), ]) { - node ('runInPod') { + node ('terminatedPod') { container('busybox') { sh 'sleep 9999999' } From ce032c60f93fd9078e9780489400387f3331073f Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Wed, 1 May 2019 09:42:59 -0400 Subject: [PATCH 08/17] Redesigned fix to remove agents when a pod is deleted, rather than merely due to a Remoting channel being disconnected. Among other things, this fixes failures in RestartPipelineTest. (Perhaps attributable to org.jvnet.hudson.test.ChannelShutdownListener rather than production behavior, but still.) --- .../kubernetes/KubernetesLauncher.java | 18 --- .../kubernetes/pod/retention/Reaper.java | 137 ++++++++++++++++++ .../pipeline/RestartPipelineTest.java | 35 +++++ .../pipeline/terminatedPodAfterRestart.groovy | 11 ++ 4 files changed, 183 insertions(+), 18 deletions(-) create mode 100644 src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java create mode 100644 src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java index 1365c4514a..9e0e7f6936 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java @@ -39,7 +39,6 @@ import org.kohsuke.stapler.DataBoundConstructor; import com.google.common.base.Throwables; -import hudson.model.Slave; import hudson.model.TaskListener; import hudson.slaves.JNLPLauncher; @@ -47,7 +46,6 @@ import io.fabric8.kubernetes.api.model.Pod; import io.fabric8.kubernetes.client.KubernetesClient; -import jenkins.model.Jenkins; /** * Launches on Kubernetes the specified {@link KubernetesComputer} instance. @@ -161,20 +159,4 @@ public AllContainersRunningPodWatcher getWatcher() { return watcher; } - @Override - public void afterDisconnect(SlaveComputer computer, TaskListener listener) { - // TODO can this be done only if the pod is dead? - Slave node = computer.getNode(); - if (node != null) { - try { - Jenkins.get().removeNode(node); - LOGGER.info(() -> "Removed disconnected agent " + node.getNodeName()); - } catch (IOException x) { - LOGGER.log(Level.WARNING, x, () -> "Unable to remove Jenkins node " + node.getNodeName()); - } - } else { - LOGGER.fine(() -> "Could not find disconnected agent " + computer.getName() + " to remove"); - } - } - } diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java new file mode 100644 index 0000000000..15277a170c --- /dev/null +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -0,0 +1,137 @@ +/* + * Copyright 2019 CloudBees, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.csanchez.jenkins.plugins.kubernetes.pod.retention; + +import hudson.Extension; +import hudson.model.Computer; +import hudson.model.Node; +import hudson.model.TaskListener; +import hudson.slaves.Cloud; +import hudson.slaves.ComputerListener; +import hudson.slaves.EphemeralNode; +import io.fabric8.kubernetes.api.model.Pod; +import io.fabric8.kubernetes.client.KubernetesClientException; +import io.fabric8.kubernetes.client.Watcher; +import java.io.IOException; +import java.util.ArrayList; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.logging.Level; +import java.util.logging.Logger; +import jenkins.model.Jenkins; +import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; +import org.csanchez.jenkins.plugins.kubernetes.KubernetesComputer; +import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave; + +/** + * Checks for deleted pods corresponding to {@link KubernetesSlave} and ensures the node is removed from Jenkins too. + * <p>If the pod has been deleted, all of the associated state (running user processes, workspace, etc.) must also be gone; + * so there is no point in retaining this agent definition any further. + * ({@link KubernetesSlave} is not an {@link EphemeralNode}: it <em>does</em> support running across Jenkins restarts.) + * <p>Note that pod retention policies other than the default {@link Never} may disable this system, + * unless some external process or garbage collection policy results in pod deletion. + */ +@Extension +public class Reaper extends ComputerListener implements Watcher<Pod> { + + private static final Logger LOGGER = Logger.getLogger(Reaper.class.getName()); + + /** + * Activate this feature only if and when some Kubernetes agent is actually used. + * Avoids touching the API server when this plugin is not even in use. + */ + private final AtomicBoolean activated = new AtomicBoolean(); + + @Override + public void onOnline(Computer c, TaskListener listener) throws IOException, InterruptedException { + if (c instanceof KubernetesComputer && activated.compareAndSet(false, true)) { + activate(); + } + } + + private void activate() { + LOGGER.fine("Activating reaper"); + // First check all existing nodes to see if they still have active pods. + // (We may have missed deletion events while Jenkins was shut off, + // or pods may have been deleted before any Kubernetes agent was brought online.) + for (Node n : new ArrayList<>(Jenkins.get().getNodes())) { + if (!(n instanceof KubernetesSlave)) { + continue; + } + KubernetesSlave ks = (KubernetesSlave) n; + String ns = ks.getNamespace(); + String name = ks.getPodName(); + try { + // TODO more efficient to do a single (or paged) list request, but tricky since there may be multiple clouds, + // and even within a single cloud an agent pod is permitted to use a nondefault namespace + if (ks.getKubernetesCloud().connect().pods().inNamespace(ns).withName(name).get() == null) { + LOGGER.info(() -> ns + "/" + name + " seems to have been deleted, so removing corresponding Jenkins agent"); + Jenkins.get().removeNode(ks); + } else { + LOGGER.fine(() -> ns + "/" + name + " still seems to exist, OK"); + } + } catch (Exception x) { + LOGGER.log(Level.WARNING, "failed to do initial reap check for " + ns + "/" + name, x); + } + } + // Now set up a watch for any subsequent pod deletions. + for (Cloud c : Jenkins.get().clouds) { + if (!(c instanceof KubernetesCloud)) { + continue; + } + KubernetesCloud kc = (KubernetesCloud) c; + try { + kc.connect().pods().inAnyNamespace().watch(this); + } catch (Exception x) { + LOGGER.log(Level.WARNING, "failed to set up watcher on " + kc.getDisplayName(), x); + } + } + } + + @Override + public void eventReceived(Watcher.Action action, Pod pod) { + if (action == Watcher.Action.DELETED) { + String ns = pod.getMetadata().getNamespace(); + String name = pod.getMetadata().getName(); + for (Node n : new ArrayList<>(Jenkins.get().getNodes())) { + if (!(n instanceof KubernetesSlave)) { + continue; + } + KubernetesSlave ks = (KubernetesSlave) n; + if (ks.getNamespace().equals(ns) && ks.getPodName().equals(name)) { + LOGGER.info(() -> ns + "/" + name + " was just deleted, so removing corresponding Jenkins agent"); + try { + Jenkins.get().removeNode(ks); + return; + } catch (Exception x) { + LOGGER.log(Level.WARNING, "failed to reap " + ns + "/" + name, x); + } + } + } + LOGGER.fine(() -> "received deletion notice for " + ns + "/" + name + " which does not seem to correspond to any Jenkins agent"); + } + } + + @Override + public void onClose(KubernetesClientException cause) { + // TODO ignore, or do we need to manually reattach the watcher? + // AllContainersRunningPodWatcher is not reattached, but this is expected to be short-lived, + // useful only until the containers of a single pod start running. + // (At least when using kubernetes-client/java, the connection gets closed after 2m on GKE + // and you need to rerun the watch. Does the fabric8io client wrap this?) + } + +} diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java index 40314eec22..a892f19581 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java @@ -56,11 +56,15 @@ import org.jvnet.hudson.test.RestartableJenkinsNonLocalhostRule; import hudson.model.Node; +import hudson.model.Result; import hudson.slaves.DumbSlave; import hudson.slaves.JNLPLauncher; import hudson.slaves.NodeProperty; import hudson.slaves.RetentionStrategy; import jenkins.model.JenkinsLocationConfiguration; +import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution; +import org.jvnet.hudson.test.Issue; +import org.jvnet.hudson.test.JenkinsRule; public class RestartPipelineTest { protected static final String CONTAINER_ENV_VAR_VALUE = "container-env-var-value"; @@ -187,6 +191,37 @@ public void runInPodWithRestart() throws Exception { }); } + @Issue("JENKINS-49707") + @Test + public void terminatedPodAfterRestart() throws Exception { + story.then(r -> { + configureCloud(); + WorkflowJob p = r.jenkins.createProject(WorkflowJob.class, "p"); + p.setDefinition(new CpsFlowDefinition(loadPipelineScript("terminatedPodAfterRestart.groovy"), true)); + WorkflowRun b = p.scheduleBuild2(0).waitForStart(); + r.waitForMessage("+ sleep", b); + }); + story.then(r -> { + WorkflowRun b = r.jenkins.getItemByFullName("p", WorkflowJob.class).getBuildByNumber(1); + r.waitForMessage("Ready to run", b); + // Note that the test is cheating here slightly. + // The watch in Reaper is still running across the in-JVM restarts, + // whereas in production it would have been cancelled during the shutdown. + // But it does not matter since we are waiting for the agent to come back online after the restart, + // which is sufficient trigger to reactivate the reaper. + // Indeed we get two Reaper instances running, which independently remove the node. + deletePods(cloud.connect(), getLabels(this, name), false); + r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); + while (!JenkinsRule.getLog(b).contains(new ExecutorStepExecution.RemovedNodeCause().getShortDescription())) { + // TODO JenkinsRule.waitForMessage has a race condition w.r.t. the termination cause printed by WorkflowRun.finish + Thread.sleep(100); + } + // Currently the logic in ExecutorStepExecution cannot handle a Jenkins restart so it prints the following. + // It does not matter since DurableTaskStep redundantly implements the same check. + r.assertLogContains(" was deleted, but do not have a node body to cancel", b); + }); + } + @Test public void getContainerLogWithRestart() throws Exception { story.then(r -> { diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy new file mode 100644 index 0000000000..c87a0ec733 --- /dev/null +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy @@ -0,0 +1,11 @@ +package org.csanchez.jenkins.plugins.kubernetes.pipeline + +podTemplate(label: 'terminatedPodAfterRestart', containers: [ + containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'), +]) { + node ('terminatedPodAfterRestart') { + container('busybox') { + sh 'sleep 9999999' + } + } +} From 61307bfa0a667915f2dee4a3166bdd5742fc790e Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Wed, 1 May 2019 18:34:07 -0400 Subject: [PATCH 09/17] We will not in general be permitted to watch pods at cluster scope, so do not even try. --- .../jenkins/plugins/kubernetes/pod/retention/Reaper.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java index 15277a170c..8621731706 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -24,6 +24,7 @@ import hudson.slaves.ComputerListener; import hudson.slaves.EphemeralNode; import io.fabric8.kubernetes.api.model.Pod; +import io.fabric8.kubernetes.client.KubernetesClient; import io.fabric8.kubernetes.client.KubernetesClientException; import io.fabric8.kubernetes.client.Watcher; import java.io.IOException; @@ -76,7 +77,10 @@ private void activate() { String name = ks.getPodName(); try { // TODO more efficient to do a single (or paged) list request, but tricky since there may be multiple clouds, - // and even within a single cloud an agent pod is permitted to use a nondefault namespace + // and even within a single cloud an agent pod is permitted to use a nondefault namespace, + // yet we do not want to do an unnamespaced pod list for RBAC reasons. + // Could use a hybrid approach: first list all pods in the configured namespace for all clouds; + // then go back and individually check any unmatched agents with their configured namespace. if (ks.getKubernetesCloud().connect().pods().inNamespace(ns).withName(name).get() == null) { LOGGER.info(() -> ns + "/" + name + " seems to have been deleted, so removing corresponding Jenkins agent"); Jenkins.get().removeNode(ks); @@ -94,7 +98,8 @@ private void activate() { } KubernetesCloud kc = (KubernetesCloud) c; try { - kc.connect().pods().inAnyNamespace().watch(this); + KubernetesClient client = kc.connect(); + client.pods().inNamespace(client.getNamespace()).watch(this); } catch (Exception x) { LOGGER.log(Level.WARNING, "failed to set up watcher on " + kc.getDisplayName(), x); } From 2c3b39e41599c684916b10f2c76c7dc500202e44 Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Wed, 5 Jun 2019 13:46:43 -0400 Subject: [PATCH 10/17] Bump. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ca3d70f04b..6db9be6126 100644 --- a/pom.xml +++ b/pom.xml @@ -145,7 +145,7 @@ <dependency> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-durable-task-step</artifactId> - <version>2.31-rc898.994233111902</version> <!-- TODO https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/104 --> + <version>2.32-rc917.b70f8edaf896</version> <!-- TODO https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/104 --> <scope>test</scope> </dependency> <dependency> <!-- SemaphoreStep --> From 5b30251b59e6d14d9d3997479ce200bf9615267f Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Wed, 5 Jun 2019 15:24:06 -0400 Subject: [PATCH 11/17] RequireUpperBoundDeps --- pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 6db9be6126..c4b8d3008b 100644 --- a/pom.xml +++ b/pom.xml @@ -50,8 +50,8 @@ <no-test-jar>false</no-test-jar> <surefire.rerunFailingTestsCount>0</surefire.rerunFailingTestsCount> <pipeline-model-definition.version>1.3.7</pipeline-model-definition.version> - <workflow-support-plugin.version>3.1</workflow-support-plugin.version> - <workflow-step-api-plugin.version>2.18</workflow-step-api-plugin.version> + <workflow-support-plugin.version>3.3</workflow-support-plugin.version> + <workflow-step-api-plugin.version>2.20</workflow-step-api-plugin.version> </properties> <dependencies> @@ -248,7 +248,7 @@ <dependency> <groupId>org.jenkins-ci.plugins</groupId> <artifactId>script-security</artifactId> - <version>1.50</version> + <version>1.58</version> </dependency> <dependency> <groupId>org.jenkins-ci.plugins</groupId> @@ -263,7 +263,7 @@ <dependency> <groupId>org.jenkins-ci.plugins</groupId> <artifactId>structs</artifactId> - <version>1.17</version> + <version>1.18</version> </dependency> <dependency> <groupId>org.jenkins-ci.plugins</groupId> From 3f00eb155773cf41d630f11467c881a54a307dc2 Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Wed, 5 Jun 2019 19:39:18 -0400 Subject: [PATCH 12/17] Need to update workflow-cps to interpret DynamicContext. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index c4b8d3008b..6c9a66a9ed 100644 --- a/pom.xml +++ b/pom.xml @@ -182,7 +182,7 @@ <dependency> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-cps</artifactId> - <version>2.61.1</version> + <version>2.70</version> <scope>test</scope> </dependency> <dependency> From 210e0f724f0a9d0f6d13a35baea03d5dc010d0d7 Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Tue, 11 Jun 2019 16:55:59 -0400 Subject: [PATCH 13/17] Test flake pending #496. --- .../plugins/kubernetes/pipeline/KubernetesPipelineTest.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java index e647781c12..762989c66b 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java @@ -51,6 +51,7 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.jvnet.hudson.test.Issue; +import org.jvnet.hudson.test.JenkinsRule; import org.jvnet.hudson.test.JenkinsRuleNonLocalhost; /** @@ -362,7 +363,10 @@ public void terminatedPod() throws Exception { r.waitForMessage("+ sleep", b); deletePods(cloud.connect(), getLabels(this, name), false); r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); - r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b); + // TODO could use waitForMessage after #496 + while (!JenkinsRule.getLog(b).contains(new ExecutorStepExecution.RemovedNodeCause().getShortDescription())) { + Thread.sleep(100); + } } } From a25d24f7cc8f73b33fd2b91f87d0be2de534d5dd Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Tue, 11 Jun 2019 17:05:15 -0400 Subject: [PATCH 14/17] Bump. --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 9ed49d20e1..90b5c4f604 100644 --- a/pom.xml +++ b/pom.xml @@ -46,7 +46,7 @@ <connectorHost /> <jenkins.host.address /> <java.level>8</java.level> - <jenkins.version>2.150.1</jenkins.version> + <jenkins.version>2.176.1</jenkins.version> <no-test-jar>false</no-test-jar> <useBeta>true</useBeta> <surefire.rerunFailingTestsCount>0</surefire.rerunFailingTestsCount> @@ -152,7 +152,7 @@ <dependency> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-durable-task-step</artifactId> - <version>2.32-rc917.b70f8edaf896</version> <!-- TODO https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/104 --> + <version>2.32-rc934.23119201f0b5</version> <!-- TODO https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/104 --> <scope>test</scope> </dependency> <dependency> <!-- SemaphoreStep --> From e71a5c5950ee976da51bf3046775ff2b22e4c170 Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Tue, 11 Jun 2019 17:08:18 -0400 Subject: [PATCH 15/17] Senseless to even try to build against _older_ LTS lines than our minimum. Unfortunately I lack write access to the repo so this Jenkinsfile change will be ignored. --- Jenkinsfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b3e75c465c..3063ec25ed 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1 +1,4 @@ -buildPlugin(configurations: buildPlugin.recommendedConfigurations().findAll { it.platform == 'linux' }) +buildPlugin(configurations: [ + [platform: 'linux', jdk: '8', jenkins: null], + [platform: 'linux', jdk: '11', jenkins: null], +]) From 70153a9224217001bfc65d9a8fac4a86405786ac Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Fri, 5 Jul 2019 15:43:05 -0400 Subject: [PATCH 16/17] workflow-durable-task-step 2.32 Co-Authored-By: Devin Nusbaum <dwnusbaum@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index b93e640e23..cf10a862cf 100644 --- a/pom.xml +++ b/pom.xml @@ -152,7 +152,7 @@ <dependency> <groupId>org.jenkins-ci.plugins.workflow</groupId> <artifactId>workflow-durable-task-step</artifactId> - <version>2.32-rc934.23119201f0b5</version> <!-- TODO https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/104 --> + <version>2.32</version> <scope>test</scope> </dependency> <dependency> <!-- SemaphoreStep --> From bb4e2978fe5ffffd5fdfa8dc0f70e23bf9fae4cc Mon Sep 17 00:00:00 2001 From: Jesse Glick <jglick@cloudbees.com> Date: Fri, 5 Jul 2019 15:52:33 -0400 Subject: [PATCH 17/17] Using improved assertion after #496. --- .../plugins/kubernetes/pipeline/KubernetesPipelineTest.java | 5 +---- .../plugins/kubernetes/pipeline/RestartPipelineTest.java | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java index cef66b283a..acf89efeef 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java @@ -353,10 +353,7 @@ public void terminatedPod() throws Exception { r.waitForMessage("+ sleep", b); deletePods(cloud.connect(), getLabels(this, name), false); r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); - // TODO could use waitForMessage after #496 - while (!JenkinsRule.getLog(b).contains(new ExecutorStepExecution.RemovedNodeCause().getShortDescription())) { - Thread.sleep(100); - } + r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b); } @Test diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java index a95e1d5055..ffb3129d94 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java @@ -212,10 +212,7 @@ public void terminatedPodAfterRestart() throws Exception { // Indeed we get two Reaper instances running, which independently remove the node. deletePods(cloud.connect(), getLabels(this, name), false); r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); - while (!JenkinsRule.getLog(b).contains(new ExecutorStepExecution.RemovedNodeCause().getShortDescription())) { - // TODO JenkinsRule.waitForMessage has a race condition w.r.t. the termination cause printed by WorkflowRun.finish - Thread.sleep(100); - } + r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b); // Currently the logic in ExecutorStepExecution cannot handle a Jenkins restart so it prints the following. // It does not matter since DurableTaskStep redundantly implements the same check. r.assertLogContains(" was deleted, but do not have a node body to cancel", b);