diff options
author | Håkon Hallingstad <hakon@oath.com> | 2019-02-13 17:57:47 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-02-13 17:57:47 +0100 |
commit | acb90e7599dccdcd0d5b0969adba4431a5cc82c9 (patch) | |
tree | c5ff109d8e7313527712b36b75a48f2dade956f6 /node-repository | |
parent | c07f31482aabdf2c9b6db4f6c2a7ea88c339ac21 (diff) | |
parent | 5c729d7f6c0a9324621aaa497349ef87eb51aac4 (diff) |
Merge pull request #8478 from vespa-engine/hakonhall/retirefail-hosts-with-failure-reports
Retire/fail hosts with failure reports
Diffstat (limited to 'node-repository')
10 files changed, 237 insertions, 28 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index e2d4aaca59d..a52ffb11fb7 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -278,13 +278,13 @@ public final class Node { * Returns a copy of this node with wantToRetire set to the given value and updated history. * If given wantToRetire is equal to the current, the method is no-op. */ - public Node withWantToRetire(boolean wantToRetire, Instant at) { + public Node withWantToRetire(boolean wantToRetire, Agent agent, Instant at) { if (wantToRetire == status.wantToRetire()) return this; return new Builder(this) .withStatus(status.withWantToRetire(wantToRetire)) // Also update history when we un-wantToRetire so the OperatorChangeApplicationMaintainer picks it // up quickly - .withHistoryEvent(History.Event.Type.wantToRetire, Agent.operator, at) + .withHistoryEvent(History.Event.Type.wantToRetire, agent, at) .build(); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index 280f723f268..52ef5b4d1c6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -99,7 +99,7 @@ public class FailedExpirer extends Maintainer { private void recycle(List<Node> nodes) { List<Node> nodesToRecycle = new ArrayList<>(); for (Node candidate : nodes) { - if (hasHardwareIssue(candidate)) { + if (NodeFailer.hasHardwareIssue(candidate, nodeRepository)) { List<String> unparkedChildren = !candidate.type().isDockerHost() ? 
Collections.emptyList() : nodeRepository.list().childrenOf(candidate).asList().stream() .filter(node -> node.state() != Node.State.parked) @@ -127,11 +127,4 @@ public class FailedExpirer extends Maintainer { return (zone.environment() == Environment.prod || zone.environment() == Environment.staging) && node.status().failCount() >= maxAllowedFailures; } - - /** Returns whether node has any kind of hardware issue */ - private static boolean hasHardwareIssue(Node node) { - return node.status().hardwareFailureDescription().isPresent() || - node.status().hardwareDivergence().isPresent(); - } - } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 6c430351a88..f6933f7f585 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -2,6 +2,8 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.cloud.config.ConfigserverConfig; +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.ApplicationLockException; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.HostLivenessTracker; @@ -26,13 +28,17 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor; import java.time.Clock; import java.time.Duration; import java.time.Instant; +import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; +import java.util.stream.Stream; import static java.util.stream.Collectors.collectingAndThen; import static java.util.stream.Collectors.counting; @@ -105,14 +111,18 
@@ public class NodeFailer extends Maintainer { continue; } String reason = entry.getValue(); - nodeRepository().fail(node.hostname(), Agent.system, reason); + nodeRepository().fail(node.hostname(), Agent.NodeFailer, reason); } } - // Active nodes updateNodeDownState(); - for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) { + List<Node> activeNodes = nodeRepository().getNodes(Node.State.active); + Set<Node> nodesWithFailureReason = new HashSet<>(); + + // Fail active nodes + for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason(activeNodes).entrySet()) { Node node = entry.getKey(); + nodesWithFailureReason.add(node); if (!failAllowedFor(node.type())) { continue; } @@ -124,6 +134,28 @@ public class NodeFailer extends Maintainer { failActive(node, reason); } + // Retire active hosts and their children. + activeNodes.stream() + .filter(node -> failAllowedFor(node.type())) + .filter(node -> !nodesWithFailureReason.contains(node)) + // Defer to parent host (it should also be active) + .filter(node -> node.parentHostname().isEmpty()) + // This will sort those with wantToRetire first + .sorted(Comparator.comparing(node -> node.status().wantToRetire(), Comparator.reverseOrder())) + .filter(node -> { + if (node.status().wantToRetire()) return true; + if (node.allocation().map(a -> a.membership().retired()).orElse(false)) return true; + List<String> reasons = reasonsToRetireActiveParentHost(node); + if (reasons.size() > 0) { + retireRecursively(node, reasons, activeNodes); + return true; + } + return false; + }) + // Only allow 1 active host to be wantToRetire at a time for rate limiting. + .limit(1) + .count(); + metric.set(throttlingActiveMetric, Math.min( 1, throttledNodeFailures), null); metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null); } @@ -137,7 +169,7 @@ public class NodeFailer extends Maintainer { if (! 
node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) { History updatedHistory = node.history() - .with(new History.Event(History.Event.Type.requested, Agent.system, lastLocalRequest.get())); + .with(new History.Event(History.Event.Type.requested, Agent.NodeFailer, lastLocalRequest.get())); nodeRepository().write(node.with(updatedHistory)); } } @@ -161,6 +193,16 @@ public class NodeFailer extends Maintainer { nodesByFailureReason.put(node, "Node has hardware failure"); } else if (node.status().hardwareDivergence().isPresent()) { nodesByFailureReason.put(node, "Node has hardware divergence"); + } else { + Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); + List<String> failureReports = reasonsToRetireActiveParentHost(hostNode); + if (failureReports.size() > 0) { + if (hostNode.equals(node)) { + nodesByFailureReason.put(node, "Host has failure reports: " + failureReports); + } else { + nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports); + } + } } } return nodesByFailureReason; @@ -187,19 +229,85 @@ public class NodeFailer extends Maintainer { }); } - private Map<Node, String> getActiveNodesByFailureReason() { + private Map<Node, String> getActiveNodesByFailureReason(List<Node> activeNodes) { Instant graceTimeEnd = clock.instant().minus(downTimeLimit); Map<Node, String> nodesByFailureReason = new HashMap<>(); - for (Node node : nodeRepository().getNodes(Node.State.active)) { + for (Node node : activeNodes) { if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! 
applicationSuspended(node)) { nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); } else if (node.status().hardwareFailureDescription().isPresent() && nodeSuspended(node)) { - nodesByFailureReason.put(node, "Node has hardware failure"); + nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get()); } } return nodesByFailureReason; } + private static List<String> reasonsToRetireActiveParentHost(Node hostNode) { + return Stream.of( + "badTotalMemorySize", + "badTotalDiskSize", + "badDiskType", + "badInterfaceSpeed", + "badCpuCount" + ) + .map(reportId -> baseReportToString(hostNode, reportId)) + .flatMap(Optional::stream) + .collect(Collectors.toList()); + } + + /** The generated string is built from the report's ID, created time, and description only. */ + static Optional<String> baseReportToString(Node node, String reportId) { + return node.reports().getReport(reportId).map(report -> + reportId + " reported " + report.getCreatedTime() + ": " + report.getDescription()); + } + + /** + * There are reasons why this node should be parked, and we'd like to do it through retiring, + * including any child nodes. + */ + private void retireRecursively(Node node, List<String> reasons, List<Node> activeNodes) { + if (activeNodes != null) { + List<Node> childNodesToRetire = activeNodes.stream() + .filter(n -> n.parentHostname().equals(Optional.of(node.hostname()))) + .collect(Collectors.toList()); + for (Node childNode : childNodesToRetire) { + retireRecursively(childNode, reasons, null); + } + } + + if (node.status().wantToRetire()) return; + retireActive(node.hostname(), node.allocation().get().owner(), reasons); + } + + private void retireActive(String hostname, ApplicationId owner, List<String> reasons) { + // Getting the application lock can take a very long time for the largest applications. + // Don't bother waiting for too long since retries is automatic with maintainers. 
+ Duration lockWait = Duration.ofSeconds(10); + try (Mutex lock = nodeRepository().lock(owner, lockWait)) { + // Recheck all conditions in case anything has changed + Optional<Node> node = nodeRepository().getNode(hostname); + if (node.isEmpty()) return; + if (node.get().state() != Node.State.active) return; + if (!node.get().allocation().orElseThrow().owner().equals(owner)) return; + if (node.get().status().wantToRetire()) return; + + log.info("Setting wantToRetire on " + node.get() + " due to these reports: " + reasons); + nodeRepository().write(node.get().withWantToRetire(true, Agent.NodeFailer, clock.instant())); + } catch (ApplicationLockException e) { + log.warning("Failed to get lock on " + owner + " within " + lockWait + " to set wantToRetire, will retry later"); + } + } + + /** Returns whether node has any kind of hardware issue */ + public static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) { + if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) { + return true; + } + + Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.getNode(parent)).orElse(node); + return reasonsToRetireActiveParentHost(hostNode).size() > 0; + } + private boolean expectConfigRequests(Node node) { return !node.type().isDockerHost() || configserverConfig.nodeAdminInContainer(); } @@ -302,12 +410,12 @@ public class NodeFailer extends Maintainer { if (failingTenantNode.state() == Node.State.active) { allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode, reasonForChildFailure); } else { - nodeRepository().fail(failingTenantNode.hostname(), Agent.system, reasonForChildFailure); + nodeRepository().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure); } } if (! 
allTenantNodesFailedOutSuccessfully) return false; - node = nodeRepository().fail(node.hostname(), Agent.system, reason); + node = nodeRepository().fail(node.hostname(), Agent.NodeFailer, reason); try { deployment.get().activate(); return true; @@ -315,7 +423,7 @@ public class NodeFailer extends Maintainer { catch (RuntimeException e) { // The expected reason for deployment to fail here is that there is no capacity available to redeploy. // In that case we should leave the node in the active state to avoid failing additional nodes. - nodeRepository().reactivate(node.hostname(), Agent.system, + nodeRepository().reactivate(node.hostname(), Agent.NodeFailer, "Failed to redeploy after being failed by NodeFailer"); log.log(Level.WARNING, "Attempted to fail " + node + " for " + node.allocation().get().owner() + ", but redeploying without the node failed", e); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 5774176956a..2c784de06ca 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -196,7 +196,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { if (zone.environment().equals(Environment.prod) && zone.system() != SystemName.cd) { inactiveExpiry = Duration.ofHours(4); // enough time for the application owner to discover and redeploy - retiredInterval = Duration.ofMinutes(29); + retiredInterval = Duration.ofMinutes(10); dirtyExpiry = Duration.ofHours(2); // enough time to clean the node retiredExpiry = Duration.ofDays(4); // give up migrating data after 4 days } else { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java 
b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java index d437b90a7f7..812c370df5f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Agent.java @@ -7,5 +7,5 @@ package com.yahoo.vespa.hosted.provision.node; * @author bratseth */ public enum Agent { - system, application, operator, NodeRetirer + system, application, operator, NodeRetirer, NodeFailer } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java index 3ccf617ce60..1e4c4503b16 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Report.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.node; import com.yahoo.slime.Cursor; import com.yahoo.slime.Inspector; +import com.yahoo.slime.Slime; import com.yahoo.vespa.config.SlimeUtils; import java.time.Instant; @@ -22,14 +23,16 @@ public class Report { private final String reportId; private final Instant createdTime; + private final String description; // This is the serialized report which is typically richer than we know of in this class. It's up to // clients of specific reports to query the details. private final Inspector reportInspector; - private Report(String reportId, Instant createdTime, Inspector reportInspector) { + private Report(String reportId, Instant createdTime, String description, Inspector reportInspector) { this.reportId = reportId; this.createdTime = createdTime; + this.description = description; this.reportInspector = reportInspector; } @@ -45,13 +48,18 @@ public class Report { public boolean shouldFailNode() { return !getDescription().isEmpty(); } /** A textual summary of the report. 
*/ - public String getDescription() { return reportInspector.field(DESCRIPTION_FIELD).asString(); } + public String getDescription() { return description; } /** For exploring the JSON (Slime) of the report. */ public Inspector getInspector() { return reportInspector; } + /** Create the simplest possible report. */ + public static Report basicReport(String reportId, Instant createdTime, String description) { + return new Report(reportId, createdTime, description, new Slime().setObject()); + } + /** The reportInspector will be used to serialize the full report later, including any createdTime and description. */ public static Report fromSlime(String reportId, Inspector reportInspector) { long millisSinceEpoch = reportInspector.field(CREATED_FIELD).asLong(); @@ -61,14 +69,18 @@ public class Report { } Instant createdTime = Instant.ofEpochMilli(millisSinceEpoch); - return new Report(reportId, createdTime, reportInspector); + String description = reportInspector.field(DESCRIPTION_FIELD).asString(); + + return new Report(reportId, createdTime, description, reportInspector); } public void toSlime(Cursor reportCursor) { SlimeUtils.copyObject(reportInspector, reportCursor); - // If the above inject inserted the created timestamp field, this is a no-op, which is what we want: - // We'd like the created field to be set the first time we see it, if it is not already set then. + // In Slime, trying to overwrite an already existing field is a no-op. + // We'll write the required fields now. If they weren't already set by the above copyObject, + // in particular the created field, the below will be set it to the current timestamp which is what we want. 
reportCursor.setLong(CREATED_FIELD, createdTime.toEpochMilli()); + if (!description.isEmpty()) reportCursor.setString(DESCRIPTION_FIELD, description); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Reports.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Reports.java index 721b9c1cbd9..fd6094ae111 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Reports.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Reports.java @@ -25,6 +25,7 @@ public class Reports { public boolean isEmpty() { return reports.isEmpty(); } public Optional<Report> getReport(String id) { return Optional.ofNullable(reports.get(id)); } public List<Report> getReports() { return List.copyOf(reports.values()); } + public Reports withReport(Report report) { return new Builder(this).setReport(report).build(); } public void toSlime(Cursor reportsParentObjectCursor, String reportsName) { if (reports.isEmpty()) return; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index 8f0f10cd9a1..e9dc186f122 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -333,6 +333,7 @@ public class NodeSerializer { case "system" : return Agent.system; case "operator" : return Agent.operator; case "NodeRetirer" : return Agent.NodeRetirer; + case "NodeFailer" : return Agent.NodeFailer; } throw new IllegalArgumentException("Unknown node event agent '" + eventAgentField.asString() + "'"); } @@ -342,6 +343,7 @@ public class NodeSerializer { case system : return "system"; case operator : return "operator"; case NodeRetirer : return "NodeRetirer"; + case NodeFailer : return "NodeFailer"; } throw new 
IllegalArgumentException("Serialized form of '" + agent + "' not defined"); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java index 29dd8646939..19e1958521a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodePatcher.java @@ -11,6 +11,7 @@ import com.yahoo.slime.Type; import com.yahoo.vespa.config.SlimeUtils; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.Report; import com.yahoo.vespa.hosted.provision.node.Reports; @@ -135,7 +136,7 @@ public class NodePatcher { case "additionalIpAddresses" : return node.withIpAddressPool(asStringSet(value)); case WANT_TO_RETIRE : - return node.withWantToRetire(asBoolean(value), nodeRepository.clock().instant()); + return node.withWantToRetire(asBoolean(value), Agent.operator, nodeRepository.clock().instant()); case WANT_TO_DEPROVISION : return node.with(node.status().withWantToDeprovision(asBoolean(value))); case "hardwareDivergence" : diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 2374eda8bca..3e022de6a5a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -8,9 +8,11 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; 
import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Report; import org.junit.Test; import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -95,6 +97,96 @@ public class NodeFailerTest { } @Test + public void set_want_to_retire_if_failure_report() { + NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6); + String dockerHostWithFailureReport = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2); + + // Set failure report to the parent and all its children. + Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low"); + tester.nodeRepository.getNodes().stream() + .filter(node -> node.hostname().equals(dockerHostWithFailureReport)) + .forEach(node -> { + Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport)); + tester.nodeRepository.write(updatedNode); + }); + + { + // The host is active + Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow(); + assertEquals(Node.State.active, parentNode.state()); + assertEquals(1, parentNode.reports().getReports().size()); + assertFalse(parentNode.status().wantToRetire()); + + List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList(); + assertEquals(3, childNodes.size()); + + // The 2 active child nodes + List<Node> activeChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.active).collect(Collectors.toList()); + assertEquals(2, activeChildNodes.size()); + assertTrue(activeChildNodes.stream().noneMatch(n -> n.status().wantToRetire())); + + // The ready child node + List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.ready).collect(Collectors.toList()); + assertEquals(1, failedChildNodes.size()); + assertTrue(activeChildNodes.stream().noneMatch(n -> 
n.status().wantToRetire())); + } + + tester.failer.run(); + + { + // The host is active with wantToRetire + Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow(); + assertEquals(Node.State.active, parentNode.state()); + assertTrue(parentNode.status().wantToRetire()); + + List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList(); + assertEquals(3, childNodes.size()); + + // The active nodes -> wantToRetire + List<Node> activeChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.active).collect(Collectors.toList()); + assertEquals(2, activeChildNodes.size()); + assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire())); + + // The ready node -> failed with wantToRetire + List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.failed).collect(Collectors.toList()); + assertEquals(1, failedChildNodes.size()); + assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire())); + } + + // Set wantToRetire on the second host. 
Rate limiting will keep it from becoming wantToRetire + + String dockerHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, dockerHostWithFailureReport); + tester.nodeRepository.getNodes().stream() + .filter(node -> node.hostname().equals(dockerHost2)) + .forEach(node -> { + Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport)); + tester.nodeRepository.write(updatedNode); + }); + + { + // dockerHost2 is active and with reports + Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow(); + assertEquals(Node.State.active, parentNode.state()); + assertEquals(1, parentNode.reports().getReports().size()); + assertFalse(parentNode.status().wantToRetire()); + + List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHost2).asList(); + assertEquals(3, childNodes.size()); + } + + tester.clock.advance(Duration.ofHours(25)); + tester.allNodesMakeAConfigRequestExcept(); + tester.failer.run(); + + { + // dockerHost2 is active with wantToRetire + Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow(); + assertEquals(Node.State.active, parentNode.state()); + assertFalse(parentNode.status().wantToRetire()); + } + } + + @Test public void nodes_for_suspended_applications_are_not_failed() { NodeFailTester tester = NodeFailTester.withTwoApplications(); tester.suspend(NodeFailTester.app1); |