summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@verizonmedia.com>2019-02-18 21:33:54 +0100
committerHåkon Hallingstad <hakon@verizonmedia.com>2019-02-18 21:33:54 +0100
commitca276d29713c78d4fb8a9e510b682f53faae3b23 (patch)
treea9205f09813e179b21b46f6e16b81c1e80b7569c /node-repository
parentc3f34b6b1de9836d6d2175757a3081ffcc29a90c (diff)
Require all child nodes to be suspended in NodeFailer
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java12
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java63
2 files changed, 74 insertions, 1 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index c1c2d7b9cc5..d7a71580a46 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -210,7 +210,7 @@ public class NodeFailer extends Maintainer {
for (Node node : activeNodes) {
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
- } else if (nodeSuspended(node)) {
+ } else if (hostSuspended(node, activeNodes)) {
if (node.status().hardwareFailureDescription().isPresent()) {
nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
} else {
@@ -296,6 +296,16 @@ public class NodeFailer extends Maintainer {
}
}
+ /** Is the node and all active children suspended? */
+ private boolean hostSuspended(Node node, List<Node> activeNodes) {
+ if (!nodeSuspended(node)) return false;
+ if (node.parentHostname().isPresent()) return true; // optimization
+ return activeNodes.stream()
+ .filter(childNode -> childNode.parentHostname().isPresent() &&
+ childNode.parentHostname().get().equals(node.hostname()))
+ .allMatch(this::nodeSuspended);
+ }
+
/**
* We can attempt to fail any number of *tenant* and *host* nodes because the operation will not be effected
* unless the node is replaced.
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index eaacf7c656b..4aa9fb2b11d 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -118,6 +118,69 @@ public class NodeFailerTest {
}
@Test
+ public void hw_fail_only_if_whole_host_is_suspended() {
+ NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6);
+ String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
+ assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
+
+ // The host has 2 nodes in active and 1 ready
+ Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.list().childrenOf(hostWithFailureReports).asList().stream()
+ .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
+ assertEquals(2, hostnamesByState.get(Node.State.active).size());
+ String activeChild1 = hostnamesByState.get(Node.State.active).get(0);
+ String activeChild2 = hostnamesByState.get(Node.State.active).get(1);
+ assertEquals(1, hostnamesByState.get(Node.State.ready).size());
+ String readyChild = hostnamesByState.get(Node.State.ready).get(0);
+
+ // Set failure report to the parent and all its children.
+ Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low");
+ tester.nodeRepository.getNodes().stream()
+ .filter(node -> node.hostname().equals(hostWithFailureReports))
+ .forEach(node -> {
+ Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport));
+ tester.nodeRepository.write(updatedNode);
+ });
+
+ // The ready node will be failed, but neither the host nor the 2 active nodes since they have not been suspended
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.failer.run();
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
+ assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
+ assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state());
+ assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild2).get().state());
+
+ // Suspending the host will not fail any more since none of the children are suspened
+ tester.suspend(hostWithFailureReports);
+ tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.failer.run();
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
+ assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
+ assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state());
+ assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild2).get().state());
+
+ // Suspending one child node will fail that out.
+ tester.suspend(activeChild1);
+ tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.failer.run();
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
+ assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state());
+ assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild2).get().state());
+
+ // Suspending the second child node will fail that out and the host.
+ tester.suspend(activeChild2);
+ tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.failer.run();
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(hostWithFailureReports).get().state());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild2).get().state());
+ }
+
+ @Test
public void nodes_for_suspended_applications_are_not_failed() {
NodeFailTester tester = NodeFailTester.withTwoApplications();
tester.suspend(NodeFailTester.app1);