Skip to content

Commit 1891ab0

Browse files
authored
Merge pull request #372 from Vlatombe/fail-fast-on-cloud-node-removal
Fail fast on cloud node removal
2 parents 295673a + 04e4192 commit 1891ab0

File tree

7 files changed

+138
-58
lines changed

7 files changed

+138
-58
lines changed

pom.xml

+6
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,12 @@
161161
<version>1.2</version>
162162
<scope>test</scope>
163163
</dependency>
164+
<dependency>
165+
<groupId>org.jenkins-ci.plugins</groupId>
166+
<artifactId>mock-slave</artifactId>
167+
<version>153.v9768799a_2294</version>
168+
<scope>test</scope>
169+
</dependency>
164170
<dependency>
165171
<groupId>org.awaitility</groupId>
166172
<artifactId>awaitility</artifactId>

src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
367367
} else {
368368
LOGGER.fine(() -> "rediscovering that " + node + " has been removed and timeout has expired");
369369
listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back");
370-
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeCause());
370+
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());
371371
}
372372
}
373373
removedNodeDiscovered = 0; // something else; reset

src/main/java/org/jenkinsci/plugins/workflow/support/pickles/ExecutorPickle.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ protected Executor tryResolve() throws Exception {
122122
Queue.getInstance().cancel(item);
123123
owner.getListener().getLogger().printf("Killed %s after waiting for %s because we assume unknown agent %s is never going to appear%n",
124124
item.task.getDisplayName(), Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS), placeholder.getAssignedLabel());
125-
throw new FlowInterruptedException(Result.ABORTED, new ExecutorStepExecution.RemovedNodeCause());
125+
throw new FlowInterruptedException(Result.ABORTED, new ExecutorStepExecution.RemovedNodeTimeoutCause());
126126
}
127127
}
128128
}

src/main/java/org/jenkinsci/plugins/workflow/support/steps/AgentErrorCondition.java

+7-2
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import java.io.IOException;
3737
import java.nio.channels.ClosedChannelException;
3838
import java.util.stream.Stream;
39+
import jenkins.model.CauseOfInterruption;
3940
import org.jenkinsci.Symbol;
4041
import org.jenkinsci.plugins.workflow.flow.ErrorCondition;
4142
import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException;
@@ -57,8 +58,7 @@ public final class AgentErrorCondition extends ErrorCondition {
5758
if (t instanceof AgentOfflineException) {
5859
return true;
5960
}
60-
if (t instanceof FlowInterruptedException && ((FlowInterruptedException) t).getCauses().stream().anyMatch(
61-
c -> c instanceof ExecutorStepExecution.RemovedNodeCause || c instanceof ExecutorStepExecution.QueueTaskCancelled)) {
61+
if (t instanceof FlowInterruptedException && ((FlowInterruptedException) t).getCauses().stream().anyMatch(Retryable.class::isInstance)) {
6262
return true;
6363
}
6464
if (isClosedChannelException(t)) {
@@ -90,6 +90,11 @@ private static boolean isClosedChannelException(Throwable t) {
9090
}
9191
}
9292

93+
/**
94+
* A marker interface for {@link CauseOfInterruption} instances that can be retried through {@link AgentErrorCondition}.
95+
*/
96+
public interface Retryable {}
97+
9398
@Symbol("agent")
9499
@Extension public static final class DescriptorImpl extends ErrorConditionDescriptor {
95100

src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ void resume(StepContext context) throws Exception {
107107
exec = item.getFuture().getStartCondition().get(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS);
108108
} catch (TimeoutException x) {
109109
listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back");
110-
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeCause());
110+
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());
111111
} catch (CancellationException x) {
112112
LOGGER.log(Level.FINE, "ceased to wait for " + node, x);
113113
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.QueueTaskCancelled());

src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java

+61-34
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import hudson.model.ResourceList;
2727
import hudson.model.Result;
2828
import hudson.model.Run;
29+
import hudson.model.Slave;
2930
import hudson.model.TaskListener;
3031
import hudson.model.TopLevelItem;
3132
import hudson.model.User;
@@ -38,6 +39,7 @@
3839
import hudson.security.ACLContext;
3940
import hudson.security.AccessControlled;
4041
import hudson.security.Permission;
42+
import hudson.slaves.AbstractCloudSlave;
4143
import hudson.slaves.OfflineCause;
4244
import hudson.slaves.WorkspaceList;
4345
import java.io.IOException;
@@ -74,6 +76,7 @@
7476
import org.acegisecurity.Authentication;
7577
import org.jenkinsci.plugins.durabletask.executors.ContinuableExecutable;
7678
import org.jenkinsci.plugins.durabletask.executors.ContinuedTask;
79+
import org.jenkinsci.plugins.durabletask.executors.OnceRetentionStrategy;
7780
import org.jenkinsci.plugins.workflow.actions.LabelAction;
7881
import org.jenkinsci.plugins.workflow.actions.QueueItemAction;
7982
import org.jenkinsci.plugins.workflow.actions.ThreadNameAction;
@@ -334,7 +337,7 @@ public void stop(@NonNull Throwable cause) throws Exception {
334337

335338
}
336339

337-
public static final class QueueTaskCancelled extends CauseOfInterruption {
340+
public static final class QueueTaskCancelled extends RetryableCauseOfInterruption {
338341
@Override public String getShortDescription() {
339342
return Messages.ExecutorStepExecution_queue_task_cancelled();
340343
}
@@ -346,51 +349,75 @@ public static final class QueueTaskCancelled extends CauseOfInterruption {
346349
return;
347350
}
348351
LOGGER.fine(() -> "received node deletion event on " + node.getNodeName());
349-
Timer.get().schedule(() -> {
350-
Computer c = node.toComputer();
351-
if (c == null || c.isOnline()) {
352-
LOGGER.fine(() -> "computer for " + node.getNodeName() + " was missing or online, skipping");
353-
return;
354-
}
355-
LOGGER.fine(() -> "processing node deletion event on " + node.getNodeName());
356-
for (Executor e : c.getExecutors()) {
357-
Queue.Executable exec = e.getCurrentExecutable();
358-
if (exec instanceof PlaceholderTask.PlaceholderExecutable) {
359-
PlaceholderTask task = ((PlaceholderTask.PlaceholderExecutable) exec).getParent();
360-
TaskListener listener;
361-
try {
362-
listener = task.context.get(TaskListener.class);
363-
} catch (Exception x) {
364-
LOGGER.log(Level.WARNING, null, x);
365-
continue;
366-
}
367-
task.withExecution(execution -> {
368-
BodyExecution body = execution.body;
369-
if (body == null) {
370-
listener.getLogger().println("Agent " + node.getNodeName() + " was deleted, but do not have a node body to cancel");
371-
return;
372-
}
373-
listener.getLogger().println("Agent " + node.getNodeName() + " was deleted; cancelling node body");
374-
if (Util.isOverridden(BodyExecution.class, body.getClass(), "cancel", Throwable.class)) {
375-
body.cancel(new FlowInterruptedException(Result.ABORTED, false, new RemovedNodeCause()));
376-
} else { // TODO remove once https://github.com/jenkinsci/workflow-cps-plugin/pull/570 is widely deployed
377-
body.cancel(new RemovedNodeCause());
378-
}
379-
});
352+
if (isOneShotAgent(node)) {
353+
LOGGER.fine(() -> "Cancelling owner run for one-shot agent " + node.getNodeName() + " immediately");
354+
cancelOwnerExecution(node, new RemovedNodeCause());
355+
} else {
356+
LOGGER.fine(() -> "Will cancel owner run for agent " + node.getNodeName() + " after waiting for " + TIMEOUT_WAITING_FOR_NODE_MILLIS + "ms");
357+
Timer.get().schedule(() -> cancelOwnerExecution(node, new RemovedNodeCause()), TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS);
358+
}
359+
}
360+
361+
private static boolean isOneShotAgent(Node node) {
362+
return node instanceof AbstractCloudSlave ||
363+
(node instanceof Slave && ((Slave) node).getRetentionStrategy() instanceof OnceRetentionStrategy);
364+
}
365+
366+
private static void cancelOwnerExecution(Node node, CauseOfInterruption... causes) {
367+
Computer c = node.toComputer();
368+
if (c == null || c.isOnline()) {
369+
LOGGER.fine(() -> "computer for " + node.getNodeName() + " was missing or online, skipping");
370+
return;
371+
}
372+
LOGGER.fine(() -> "processing node deletion event on " + node.getNodeName());
373+
for (Executor e : c.getExecutors()) {
374+
Queue.Executable exec = e.getCurrentExecutable();
375+
if (exec instanceof PlaceholderTask.PlaceholderExecutable) {
376+
PlaceholderTask task = ((PlaceholderTask.PlaceholderExecutable) exec).getParent();
377+
TaskListener listener;
378+
try {
379+
listener = task.context.get(TaskListener.class);
380+
} catch (Exception x) {
381+
LOGGER.log(Level.WARNING, null, x);
382+
continue;
380383
}
384+
task.withExecution(execution -> {
385+
BodyExecution body = execution.body;
386+
if (body == null) {
387+
listener.getLogger().println("Agent " + node.getNodeName() + " was deleted, but do not have a node body to cancel");
388+
return;
389+
}
390+
listener.getLogger().println("Agent " + node.getNodeName() + " was deleted; cancelling node body");
391+
if (Util.isOverridden(BodyExecution.class, body.getClass(), "cancel", Throwable.class)) {
392+
body.cancel(new FlowInterruptedException(Result.ABORTED, false, causes));
393+
} else { // TODO remove once https://github.com/jenkinsci/workflow-cps-plugin/pull/570 is widely deployed
394+
body.cancel(causes);
395+
}
396+
});
381397
}
382-
}, TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS);
398+
}
383399
}
384400
}
385401

386-
public static final class RemovedNodeCause extends CauseOfInterruption {
402+
public static final class RemovedNodeCause extends RetryableCauseOfInterruption {
387403
@SuppressFBWarnings(value = "MS_SHOULD_BE_FINAL", justification = "deliberately mutable")
388404
public static boolean ENABLED = Boolean.parseBoolean(System.getProperty(ExecutorStepExecution.class.getName() + ".REMOVED_NODE_DETECTION", "true"));
389405
@Override public String getShortDescription() {
390406
return "Agent was removed";
391407
}
392408
}
393409

410+
public static final class RemovedNodeTimeoutCause extends RetryableCauseOfInterruption {
411+
@Override public String getShortDescription() {
412+
return "Timeout waiting for agent to come back";
413+
}
414+
}
415+
416+
/**
417+
* Base class for a cause of interruption that can be retried via {@link AgentErrorCondition}.
418+
*/
419+
private abstract static class RetryableCauseOfInterruption extends CauseOfInterruption implements AgentErrorCondition.Retryable {}
420+
394421
/** Transient handle of a running executor task. */
395422
private static final class RunningTask {
396423
/** null until placeholder executable runs */

src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java

+61-19
Original file line numberDiff line numberDiff line change
@@ -24,33 +24,38 @@
2424

2525
package org.jenkinsci.plugins.workflow.support.steps;
2626

27+
import static org.awaitility.Awaitility.await;
28+
import static org.hamcrest.MatcherAssert.assertThat;
29+
import static org.hamcrest.Matchers.anyOf;
30+
import static org.hamcrest.Matchers.arrayWithSize;
31+
import static org.hamcrest.Matchers.contains;
32+
import static org.hamcrest.Matchers.emptyArray;
33+
import static org.hamcrest.Matchers.hasSize;
34+
import static org.hamcrest.Matchers.isA;
35+
import static org.junit.Assert.assertEquals;
36+
import static org.junit.Assert.assertNotNull;
37+
2738
import hudson.model.Label;
2839
import hudson.model.Queue;
2940
import hudson.model.Result;
3041
import hudson.slaves.DumbSlave;
3142
import hudson.slaves.RetentionStrategy;
3243
import java.io.File;
44+
import java.time.Duration;
3345
import java.util.ArrayList;
3446
import java.util.List;
3547
import java.util.logging.Level;
3648
import jenkins.model.InterruptedBuildAction;
37-
import static org.hamcrest.MatcherAssert.assertThat;
38-
import static org.hamcrest.Matchers.contains;
39-
import static org.hamcrest.Matchers.emptyArray;
40-
import static org.hamcrest.Matchers.isA;
41-
import static org.hamcrest.Matchers.anyOf;
49+
import org.jenkinci.plugins.mock_slave.MockCloud;
50+
import org.jenkinsci.plugins.durabletask.executors.OnceRetentionStrategy;
4251
import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition;
4352
import org.jenkinsci.plugins.workflow.flow.FlowExecutionList;
4453
import org.jenkinsci.plugins.workflow.job.WorkflowJob;
4554
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
4655
import org.jenkinsci.plugins.workflow.test.steps.SemaphoreStep;
47-
import static org.junit.Assert.assertEquals;
48-
import static org.junit.Assert.assertNotNull;
49-
import static org.junit.Assert.assertNull;
5056
import org.junit.ClassRule;
5157
import org.junit.Rule;
5258
import org.junit.Test;
53-
import org.junit.rules.TemporaryFolder;
5459
import org.jvnet.hudson.test.BuildWatcher;
5560
import org.jvnet.hudson.test.Issue;
5661
import org.jvnet.hudson.test.JenkinsSessionRule;
@@ -60,9 +65,12 @@ public class ExecutorStepDynamicContextTest {
6065

6166
@ClassRule public static BuildWatcher buildWatcher = new BuildWatcher();
6267
@Rule public JenkinsSessionRule sessions = new JenkinsSessionRule();
63-
@Rule public TemporaryFolder tmp = new TemporaryFolder();
6468
@Rule public LoggerRule logging = new LoggerRule();
6569

70+
private void commonSetup() {
71+
logging.recordPackage(ExecutorStepExecution.class, Level.FINE).record(FlowExecutionList.class, Level.FINE);
72+
}
73+
6674
@Test public void canceledQueueItem() throws Throwable {
6775
sessions.then(j -> {
6876
DumbSlave s = j.createSlave(Label.get("remote"));
@@ -75,11 +83,7 @@ public class ExecutorStepDynamicContextTest {
7583
sessions.then(j -> {
7684
SemaphoreStep.success("wait/1", null);
7785
WorkflowRun b = j.jenkins.getItemByFullName("p", WorkflowJob.class).getBuildByNumber(1);
78-
while (Queue.getInstance().getItems().length == 0) {
79-
Thread.sleep(100);
80-
}
81-
Queue.Item[] items = Queue.getInstance().getItems();
82-
assertEquals(1, items.length);
86+
var items = await().timeout(Duration.ofMinutes(1)).until(() -> j.jenkins.getQueue().getItems(), arrayWithSize(1));
8387
Queue.getInstance().cancel(items[0]);
8488
j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(b));
8589
InterruptedBuildAction iba = b.getAction(InterruptedBuildAction.class);
@@ -98,7 +102,7 @@ public class ExecutorStepDynamicContextTest {
98102
*/
99103
@Issue("JENKINS-36013")
100104
@Test public void normalNodeDisappearance() throws Throwable {
101-
logging.recordPackage(ExecutorStepExecution.class, Level.FINE).record(FlowExecutionList.class, Level.FINE);
105+
commonSetup();
102106
sessions.then(j -> {
103107
// Start up a build that needs executor and then reboot and take the node offline
104108
// Starting job first ensures we don't immediately fail if Node comes from a Cloud
@@ -114,20 +118,19 @@ public class ExecutorStepDynamicContextTest {
114118
sessions.then(j -> {
115119
// Second session: the agent no longer exists, so the build should give up waiting for it and abort
116120
assertEquals(0, j.jenkins.getLabel("ghost").getNodes().size()); // Make sure test impl is correctly deleted
117-
assertNull(j.jenkins.getNode("ghost")); // Make sure test impl is correctly deleted
118121
WorkflowRun run = j.jenkins.getItemByFullName("p", WorkflowJob.class).getLastBuild();
119122
j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(run));
120123
j.assertLogContains("slave0 has been removed for ", run);
121124
assertThat(j.jenkins.getQueue().getItems(), emptyArray());
122125
InterruptedBuildAction iba = run.getAction(InterruptedBuildAction.class);
123126
assertNotNull(iba);
124-
assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeCause.class)));
127+
assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeTimeoutCause.class)));
125128
});
126129
}
127130

128131
@Issue("JENKINS-36013")
129132
@Test public void parallelNodeDisappearance() throws Throwable {
130-
logging.recordPackage(ExecutorStepExecution.class, Level.FINE).record(FlowExecutionList.class, Level.FINE);
133+
commonSetup();
131134
sessions.then(j -> {
132135
WorkflowJob p = j.createProject(WorkflowJob.class, "p");
133136
p.setDefinition(new CpsFlowDefinition("def bs = [:]; for (int _i = 0; _i < 5; _i++) {def i = _i; bs[/b$i/] = {node('remote') {semaphore(/s$i/)}}}; parallel bs", true));
@@ -207,4 +210,43 @@ public class ExecutorStepDynamicContextTest {
207210
});
208211
}
209212

213+
@Test public void onceRetentionStrategyNodeDisappearance() throws Throwable {
214+
commonSetup();
215+
sessions.then(j -> {
216+
DumbSlave s = j.createSlave(Label.get("ghost"));
217+
s.setRetentionStrategy(new OnceRetentionStrategy(0));
218+
WorkflowJob p = j.createProject(WorkflowJob.class, "p");
219+
p.setDefinition(new CpsFlowDefinition("node('ghost') {if (isUnix()) {sh 'sleep infinity'} else {bat 'echo + sleep infinity && ping -n 999999 localhost'}}", true));
220+
var run = p.scheduleBuild2(0).waitForStart();
221+
j.waitForMessage("+ sleep infinity", run);
222+
j.jenkins.removeNode(s);
223+
j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(run));
224+
assertThat(j.jenkins.getQueue().getItems(), emptyArray());
225+
InterruptedBuildAction iba = run.getAction(InterruptedBuildAction.class);
226+
assertNotNull(iba);
227+
assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeCause.class)));
228+
});
229+
}
230+
231+
@Test public void cloudNodeDisappearance() throws Throwable {
232+
commonSetup();
233+
sessions.then(j -> {
234+
var mockCloud = new MockCloud("mock");
235+
mockCloud.setLabels("mock");
236+
j.jenkins.clouds.add(mockCloud);
237+
WorkflowJob p = j.createProject(WorkflowJob.class, "p");
238+
p.setDefinition(new CpsFlowDefinition("node('mock') {if (isUnix()) {sh 'sleep infinity'} else {bat 'echo + sleep infinity && ping -n 999999 localhost'}}", true));
239+
WorkflowRun run = p.scheduleBuild2(0).waitForStart();
240+
j.waitForMessage("+ sleep infinity", run);
241+
var mockNodes = j.jenkins.getLabel("mock").getNodes();
242+
assertThat(mockNodes, hasSize(1));
243+
var mockNode = mockNodes.iterator().next();
244+
j.jenkins.removeNode(mockNode);
245+
j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(run));
246+
assertThat(j.jenkins.getQueue().getItems(), emptyArray());
247+
InterruptedBuildAction iba = run.getAction(InterruptedBuildAction.class);
248+
assertNotNull(iba);
249+
assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeCause.class)));
250+
});
251+
}
210252
}

0 commit comments

Comments
 (0)