summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
author: Håkon Hallingstad <hakon@verizonmedia.com> 2019-02-12 15:20:50 +0100
committer: Håkon Hallingstad <hakon@verizonmedia.com> 2019-02-12 15:20:50 +0100
commit: 575cfc5f25de837d36c8882b1728ee773b83fa2a (patch)
tree: f8758b12498317b754d4b9ec9eb613f61590acff /node-repository
parent: 0247ccd083972cd433618002db6a0d23f13fb4e1 (diff)
Retire/fail hosts with failure reports
NodeFailer will: - fail a ready node if the host has failure reports. - set wantToRetire on an active parent host, and all children, if it has failure reports. Makes the NodeFailer operate as a NodeFailer agent to make that clearer in the history.
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java 4
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java 9
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 111
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java 2
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java 2
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java 22
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Reports.java 1
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java 2
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java 3
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java 46
10 files changed, 174 insertions, 28 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index b2597e9cc50..eaed94395ba 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -266,13 +266,13 @@ public final class Node {
* Returns a copy of this node with wantToRetire set to the given value and updated history.
* If given wantToRetire is equal to the current, the method is no-op.
*/
- public Node withWantToRetire(boolean wantToRetire, Instant at) {
+ public Node withWantToRetire(boolean wantToRetire, Agent agent, Instant at) {
if (wantToRetire == status.wantToRetire()) return this;
return new Builder(this)
.withStatus(status.withWantToRetire(wantToRetire))
// Also update history when we un-wantToRetire so the OperatorChangeApplicationMaintainer picks it
// up quickly
- .withHistoryEvent(History.Event.Type.wantToRetire, Agent.operator, at)
+ .withHistoryEvent(History.Event.Type.wantToRetire, agent, at)
.build();
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 280f723f268..621ba2f778f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -99,7 +99,7 @@ public class FailedExpirer extends Maintainer {
private void recycle(List<Node> nodes) {
List<Node> nodesToRecycle = new ArrayList<>();
for (Node candidate : nodes) {
- if (hasHardwareIssue(candidate)) {
+ if (NodeFailer.hasHardwareIssue(candidate, nodes)) {
List<String> unparkedChildren = !candidate.type().isDockerHost() ? Collections.emptyList() :
nodeRepository.list().childrenOf(candidate).asList().stream()
.filter(node -> node.state() != Node.State.parked)
@@ -127,11 +127,4 @@ public class FailedExpirer extends Maintainer {
return (zone.environment() == Environment.prod || zone.environment() == Environment.staging) &&
node.status().failCount() >= maxAllowedFailures;
}
-
- /** Returns whether node has any kind of hardware issue */
- private static boolean hasHardwareIssue(Node node) {
- return node.status().hardwareFailureDescription().isPresent() ||
- node.status().hardwareDivergence().isPresent();
- }
-
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 6c430351a88..093ab680d97 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -2,6 +2,8 @@
package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.cloud.config.ConfigserverConfig;
+import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.ApplicationLockException;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.HostLivenessTracker;
@@ -27,12 +29,15 @@ import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
+import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
import static java.util.stream.Collectors.collectingAndThen;
import static java.util.stream.Collectors.counting;
@@ -105,14 +110,18 @@ public class NodeFailer extends Maintainer {
continue;
}
String reason = entry.getValue();
- nodeRepository().fail(node.hostname(), Agent.system, reason);
+ nodeRepository().fail(node.hostname(), Agent.NodeFailer, reason);
}
}
- // Active nodes
updateNodeDownState();
- for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) {
+ List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
+ Set<Node> nodesWithFailureReason = new HashSet<>();
+
+ // Fail active nodes
+ for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason(activeNodes).entrySet()) {
Node node = entry.getKey();
+ nodesWithFailureReason.add(node);
if (!failAllowedFor(node.type())) {
continue;
}
@@ -124,6 +133,16 @@ public class NodeFailer extends Maintainer {
failActive(node, reason);
}
+ // Retire active nodes
+ for (Node node : activeNodes) {
+ if (!failAllowedFor(node.type())) continue;
+ if (nodesWithFailureReason.contains(node)) continue;
+ if (node.parentHostname().isPresent()) continue; // Defer to parent host (it should be active too)
+ List<String> reasons = reasonsToRetireActiveParentHost(node);
+ if (reasons.isEmpty()) continue;
+ retireRecursively(node, reasons, activeNodes);
+ }
+
metric.set(throttlingActiveMetric, Math.min( 1, throttledNodeFailures), null);
metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null);
}
@@ -137,7 +156,7 @@ public class NodeFailer extends Maintainer {
if (! node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) {
History updatedHistory = node.history()
- .with(new History.Event(History.Event.Type.requested, Agent.system, lastLocalRequest.get()));
+ .with(new History.Event(History.Event.Type.requested, Agent.NodeFailer, lastLocalRequest.get()));
nodeRepository().write(node.with(updatedHistory));
}
}
@@ -161,6 +180,16 @@ public class NodeFailer extends Maintainer {
nodesByFailureReason.put(node, "Node has hardware failure");
} else if (node.status().hardwareDivergence().isPresent()) {
nodesByFailureReason.put(node, "Node has hardware divergence");
+ } else {
+ Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
+ List<String> failureReports = reasonsToRetireActiveParentHost(hostNode);
+ if (failureReports.size() > 0) {
+ if (hostNode.equals(node)) {
+ nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
+ } else {
+ nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports);
+ }
+ }
}
}
return nodesByFailureReason;
@@ -187,19 +216,81 @@ public class NodeFailer extends Maintainer {
});
}
- private Map<Node, String> getActiveNodesByFailureReason() {
+ private Map<Node, String> getActiveNodesByFailureReason(List<Node> activeNodes) {
Instant graceTimeEnd = clock.instant().minus(downTimeLimit);
Map<Node, String> nodesByFailureReason = new HashMap<>();
- for (Node node : nodeRepository().getNodes(Node.State.active)) {
+ for (Node node : activeNodes) {
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
} else if (node.status().hardwareFailureDescription().isPresent() && nodeSuspended(node)) {
- nodesByFailureReason.put(node, "Node has hardware failure");
+ nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
}
}
return nodesByFailureReason;
}
+ private static List<String> reasonsToRetireActiveParentHost(Node hostNode) {
+ return Stream.of("badTotalMemorySize", "badTotalDiskSize")
+ .map(reportId -> baseReportToString(hostNode, reportId))
+ .flatMap(Optional::stream)
+ .collect(Collectors.toList());
+ }
+
+ /** The generated string is built from the report's ID, created time, and description only. */
+ static Optional<String> baseReportToString(Node node, String reportId) {
+ return node.reports().getReport(reportId).map(report ->
+ reportId + " reported " + report.getCreatedTime() + ": " + report.getDescription());
+ }
+
+ /**
+ * There are reasons why this node should be parked, and we'd like to do it through retiring,
+ * including any child nodes.
+ */
+ private void retireRecursively(Node node, List<String> reasons, List<Node> activeNodesIfMaybeParent) {
+ if (activeNodesIfMaybeParent != null) {
+ List<Node> childNodesToRetire = activeNodesIfMaybeParent.stream()
+ .filter(n -> n.parentHostname().equals(Optional.of(node.hostname())))
+ .collect(Collectors.toList());
+ for (Node childNode : childNodesToRetire) {
+ retireRecursively(childNode, reasons, null);
+ }
+ }
+
+ if (node.status().wantToRetire()) return;
+ retireActive(node.hostname(), node.allocation().get().owner(), reasons);
+ }
+
+ private void retireActive(String hostname, ApplicationId owner, List<String> reasons) {
+ // Getting the application lock can take a very long time for the largest applications.
+ // Don't bother waiting too long, since maintainers retry automatically.
+ Duration lockWait = Duration.ofMinutes(1);
+ try (Mutex lock = nodeRepository().lock(owner, lockWait)) {
+ // Recheck all conditions in case anything has changed
+ Optional<Node> node = nodeRepository().getNode(hostname);
+ if (node.isEmpty()) return;
+ if (node.get().state() != Node.State.active) return;
+ if (!node.get().allocation().orElseThrow().owner().equals(owner)) return;
+ if (node.get().status().wantToRetire()) return;
+
+ log.info("Setting wantToRetire on " + node.get() + " due to these reports: " + reasons);
+ nodeRepository().write(node.get().withWantToRetire(true, Agent.NodeFailer, clock.instant()));
+ } catch (ApplicationLockException e) {
+ log.warning("Failed to get lock on " + owner + " within " + lockWait + " to set wantToRetire, will retry later");
+ }
+ }
+
+ /** Returns whether node has any kind of hardware issue */
+ public static boolean hasHardwareIssue(Node node, List<Node> nodes) {
+ if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) {
+ return true;
+ }
+
+ Node hostNode = node.parentHostname()
+ .flatMap(parent -> nodes.stream().filter(n -> n.hostname().equals(parent)).findFirst())
+ .orElse(node);
+ return reasonsToRetireActiveParentHost(hostNode).size() > 0;
+ }
+
private boolean expectConfigRequests(Node node) {
return !node.type().isDockerHost() || configserverConfig.nodeAdminInContainer();
}
@@ -302,12 +393,12 @@ public class NodeFailer extends Maintainer {
if (failingTenantNode.state() == Node.State.active) {
allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode, reasonForChildFailure);
} else {
- nodeRepository().fail(failingTenantNode.hostname(), Agent.system, reasonForChildFailure);
+ nodeRepository().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure);
}
}
if (! allTenantNodesFailedOutSuccessfully) return false;
- node = nodeRepository().fail(node.hostname(), Agent.system, reason);
+ node = nodeRepository().fail(node.hostname(), Agent.NodeFailer, reason);
try {
deployment.get().activate();
return true;
@@ -315,7 +406,7 @@ public class NodeFailer extends Maintainer {
catch (RuntimeException e) {
// The expected reason for deployment to fail here is that there is no capacity available to redeploy.
// In that case we should leave the node in the active state to avoid failing additional nodes.
- nodeRepository().reactivate(node.hostname(), Agent.system,
+ nodeRepository().reactivate(node.hostname(), Agent.NodeFailer,
"Failed to redeploy after being failed by NodeFailer");
log.log(Level.WARNING, "Attempted to fail " + node + " for " + node.allocation().get().owner() +
", but redeploying without the node failed", e);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 5774176956a..2c784de06ca 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -196,7 +196,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
if (zone.environment().equals(Environment.prod) && zone.system() != SystemName.cd) {
inactiveExpiry = Duration.ofHours(4); // enough time for the application owner to discover and redeploy
- retiredInterval = Duration.ofMinutes(29);
+ retiredInterval = Duration.ofMinutes(10);
dirtyExpiry = Duration.ofHours(2); // enough time to clean the node
retiredExpiry = Duration.ofDays(4); // give up migrating data after 4 days
} else {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
index d437b90a7f7..812c370df5f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java
@@ -7,5 +7,5 @@ package com.yahoo.vespa.hosted.provision.node;
* @author bratseth
*/
public enum Agent {
- system, application, operator, NodeRetirer
+ system, application, operator, NodeRetirer, NodeFailer
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java
index 3ccf617ce60..1e4c4503b16 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.node;
import com.yahoo.slime.Cursor;
import com.yahoo.slime.Inspector;
+import com.yahoo.slime.Slime;
import com.yahoo.vespa.config.SlimeUtils;
import java.time.Instant;
@@ -22,14 +23,16 @@ public class Report {
private final String reportId;
private final Instant createdTime;
+ private final String description;
// This is the serialized report which is typically richer than we know of in this class. It's up to
// clients of specific reports to query the details.
private final Inspector reportInspector;
- private Report(String reportId, Instant createdTime, Inspector reportInspector) {
+ private Report(String reportId, Instant createdTime, String description, Inspector reportInspector) {
this.reportId = reportId;
this.createdTime = createdTime;
+ this.description = description;
this.reportInspector = reportInspector;
}
@@ -45,13 +48,18 @@ public class Report {
public boolean shouldFailNode() { return !getDescription().isEmpty(); }
/** A textual summary of the report. */
- public String getDescription() { return reportInspector.field(DESCRIPTION_FIELD).asString(); }
+ public String getDescription() { return description; }
/** For exploring the JSON (Slime) of the report. */
public Inspector getInspector() {
return reportInspector;
}
+ /** Create the simplest possible report. */
+ public static Report basicReport(String reportId, Instant createdTime, String description) {
+ return new Report(reportId, createdTime, description, new Slime().setObject());
+ }
+
/** The reportInspector will be used to serialize the full report later, including any createdTime and description. */
public static Report fromSlime(String reportId, Inspector reportInspector) {
long millisSinceEpoch = reportInspector.field(CREATED_FIELD).asLong();
@@ -61,14 +69,18 @@ public class Report {
}
Instant createdTime = Instant.ofEpochMilli(millisSinceEpoch);
- return new Report(reportId, createdTime, reportInspector);
+ String description = reportInspector.field(DESCRIPTION_FIELD).asString();
+
+ return new Report(reportId, createdTime, description, reportInspector);
}
public void toSlime(Cursor reportCursor) {
SlimeUtils.copyObject(reportInspector, reportCursor);
- // If the above inject inserted the created timestamp field, this is a no-op, which is what we want:
- // We'd like the created field to be set the first time we see it, if it is not already set then.
+ // In Slime, trying to overwrite an already existing field is a no-op.
+ // We'll write the required fields now. If they weren't already set by the above copyObject,
+ // in particular the created field, the call below will set it to the current timestamp, which is what we want.
reportCursor.setLong(CREATED_FIELD, createdTime.toEpochMilli());
+ if (!description.isEmpty()) reportCursor.setString(DESCRIPTION_FIELD, description);
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Reports.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Reports.java
index 721b9c1cbd9..fd6094ae111 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Reports.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Reports.java
@@ -25,6 +25,7 @@ public class Reports {
public boolean isEmpty() { return reports.isEmpty(); }
public Optional<Report> getReport(String id) { return Optional.ofNullable(reports.get(id)); }
public List<Report> getReports() { return List.copyOf(reports.values()); }
+ public Reports withReport(Report report) { return new Builder(this).setReport(report).build(); }
public void toSlime(Cursor reportsParentObjectCursor, String reportsName) {
if (reports.isEmpty()) return;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index 7f1beee27b5..9e3bfc821ec 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -323,6 +323,7 @@ public class NodeSerializer {
case "system" : return Agent.system;
case "operator" : return Agent.operator;
case "NodeRetirer" : return Agent.NodeRetirer;
+ case "NodeFailer" : return Agent.NodeFailer;
}
throw new IllegalArgumentException("Unknown node event agent '" + eventAgentField.asString() + "'");
}
@@ -332,6 +333,7 @@ public class NodeSerializer {
case system : return "system";
case operator : return "operator";
case NodeRetirer : return "NodeRetirer";
+ case NodeFailer : return "NodeFailer";
}
throw new IllegalArgumentException("Serialized form of '" + agent + "' not defined");
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java
index 28ba6a59d41..4dfa20c6267 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java
@@ -11,6 +11,7 @@ import com.yahoo.slime.Type;
import com.yahoo.vespa.config.SlimeUtils;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.Report;
import com.yahoo.vespa.hosted.provision.node.Reports;
@@ -135,7 +136,7 @@ public class NodePatcher {
case "additionalIpAddresses" :
return node.withIpAddressPool(asStringSet(value));
case WANT_TO_RETIRE :
- return node.withWantToRetire(asBoolean(value), nodeRepository.clock().instant());
+ return node.withWantToRetire(asBoolean(value), Agent.operator, nodeRepository.clock().instant());
case WANT_TO_DEPROVISION :
return node.with(node.status().withWantToDeprovision(asBoolean(value)));
case "hardwareDivergence" :
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 2374eda8bca..b0466885931 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -8,9 +8,11 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Report;
import org.junit.Test;
import java.time.Duration;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -95,6 +97,50 @@ public class NodeFailerTest {
}
@Test
+ public void set_want_to_retire_if_failure_report() {
+ NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6);
+ String dockerHostWithFailureReport = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
+
+ // Set the failure report on the parent host only; NodeFailer consults the parent's reports for its children.
+ Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low");
+ tester.nodeRepository.getNodes().stream()
+ .filter(node -> node.hostname().equals(dockerHostWithFailureReport))
+ .forEach(node -> {
+ Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport));
+ tester.nodeRepository.write(updatedNode);
+ });
+
+ {
+ // The host should have 2 nodes in active and 1 ready
+ List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList();
+ Map<Node.State, List<String>> hostnamesByState = childNodes.stream()
+ .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
+ assertEquals(2, hostnamesByState.get(Node.State.active).size());
+ assertEquals(1, hostnamesByState.get(Node.State.ready).size());
+ }
+
+ tester.failer.run();
+ tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.failer.run();
+
+ {
+ List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList();
+ assertEquals(3, childNodes.size());
+
+ // The active nodes -> wantToRetire
+ List<Node> activeChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.active).collect(Collectors.toList());
+ assertEquals(2, activeChildNodes.size());
+ assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire()));
+
+ // The ready node -> failed
+ List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.failed).collect(Collectors.toList());
+ assertEquals(1, failedChildNodes.size());
+ assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire()));
+ }
+ }
+
+ @Test
public void nodes_for_suspended_applications_are_not_failed() {
NodeFailTester tester = NodeFailTester.withTwoApplications();
tester.suspend(NodeFailTester.app1);