summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@verizonmedia.com>2019-02-12 22:09:23 +0100
committerHåkon Hallingstad <hakon@verizonmedia.com>2019-02-12 22:09:23 +0100
commitdb7fafdf45dca3e1ae83e32c1332f5d3669a6d51 (patch)
treefc36aadc181aacd2bb7ef24ede43d33197583297 /node-repository
parenta89a4821cdfd14619ee2e6029d3461bf6e6b2de1 (diff)
Max 1 active host with wantToRetire, and fix NodeFailer.hasHardwareIssue
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java29
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java64
3 files changed, 74 insertions, 21 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 621ba2f778f..52ef5b4d1c6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -99,7 +99,7 @@ public class FailedExpirer extends Maintainer {
private void recycle(List<Node> nodes) {
List<Node> nodesToRecycle = new ArrayList<>();
for (Node candidate : nodes) {
- if (NodeFailer.hasHardwareIssue(candidate, nodes)) {
+ if (NodeFailer.hasHardwareIssue(candidate, nodeRepository)) {
List<String> unparkedChildren = !candidate.type().isDockerHost() ? Collections.emptyList() :
nodeRepository.list().childrenOf(candidate).asList().stream()
.filter(node -> node.state() != Node.State.parked)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index d0b1beb8e28..a8665595f97 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -133,14 +133,23 @@ public class NodeFailer extends Maintainer {
failActive(node, reason);
}
- // Retire active nodes
- for (Node node : activeNodes) {
- if (!failAllowedFor(node.type())) continue;
- if (nodesWithFailureReason.contains(node)) continue;
- if (node.parentHostname().isPresent()) continue; // Defer to parent host (it should be active too)
+ // Retire active nodes. Only allow 1 active host to be wantToRetire at a time for rate limiting.
+ final long maxWantToRetireHosts = 1;
+ List<Node> candidateNodes = activeNodes.stream()
+ .filter(node -> failAllowedFor(node.type()))
+ .filter(node -> !nodesWithFailureReason.contains(node))
+ // Defer to parent host (it should also be active)
+ .filter(node -> node.parentHostname().isEmpty())
+ .collect(Collectors.toList());
+ long currentWantToRetireHosts = candidateNodes.stream().filter(node -> node.status().wantToRetire()).count();
+
+ for (int i = 0; i < candidateNodes.size() && currentWantToRetireHosts < maxWantToRetireHosts; ++i) {
+ Node node = candidateNodes.get(i);
List<String> reasons = reasonsToRetireActiveParentHost(node);
- if (reasons.isEmpty()) continue;
- retireRecursively(node, reasons, activeNodes);
+ if (reasons.size() > 0) {
+ retireRecursively(node, reasons, activeNodes);
+ ++currentWantToRetireHosts;
+ }
}
metric.set(throttlingActiveMetric, Math.min( 1, throttledNodeFailures), null);
@@ -286,14 +295,12 @@ public class NodeFailer extends Maintainer {
}
/** Returns whether node has any kind of hardware issue */
- public static boolean hasHardwareIssue(Node node, List<Node> nodes) {
+ public static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) {
if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) {
return true;
}
- Node hostNode = node.parentHostname()
- .flatMap(parent -> nodes.stream().filter(n -> n.hostname().equals(parent)).findFirst())
- .orElse(node);
+ Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.getNode(parent)).orElse(node);
return reasonsToRetireActiveParentHost(hostNode).size() > 0;
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index b0466885931..3e022de6a5a 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -111,20 +111,34 @@ public class NodeFailerTest {
});
{
- // The host should have 2 nodes in active and 1 ready
+ // The host is active
+ Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow();
+ assertEquals(Node.State.active, parentNode.state());
+ assertEquals(1, parentNode.reports().getReports().size());
+ assertFalse(parentNode.status().wantToRetire());
+
List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList();
- Map<Node.State, List<String>> hostnamesByState = childNodes.stream()
- .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
- assertEquals(2, hostnamesByState.get(Node.State.active).size());
- assertEquals(1, hostnamesByState.get(Node.State.ready).size());
+ assertEquals(3, childNodes.size());
+
+ // The 2 active child nodes
+ List<Node> activeChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.active).collect(Collectors.toList());
+ assertEquals(2, activeChildNodes.size());
+ assertTrue(activeChildNodes.stream().noneMatch(n -> n.status().wantToRetire()));
+
+ // The ready child node
+ List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.ready).collect(Collectors.toList());
+ assertEquals(1, failedChildNodes.size());
+ assertTrue(activeChildNodes.stream().noneMatch(n -> n.status().wantToRetire()));
}
tester.failer.run();
- tester.clock.advance(Duration.ofHours(25));
- tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
{
+ // The host is active with wantToRetire
+ Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow();
+ assertEquals(Node.State.active, parentNode.state());
+ assertTrue(parentNode.status().wantToRetire());
+
List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList();
assertEquals(3, childNodes.size());
@@ -133,11 +147,43 @@ public class NodeFailerTest {
assertEquals(2, activeChildNodes.size());
assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire()));
- // The ready node -> failed
+ // The ready node -> failed with wantToRetire
List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.failed).collect(Collectors.toList());
assertEquals(1, failedChildNodes.size());
assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire()));
}
+
+ // Set wantToRetire on the second host. Rate limiting will keep it from becoming wantToRetire
+
+ String dockerHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, dockerHostWithFailureReport);
+ tester.nodeRepository.getNodes().stream()
+ .filter(node -> node.hostname().equals(dockerHost2))
+ .forEach(node -> {
+ Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport));
+ tester.nodeRepository.write(updatedNode);
+ });
+
+ {
+ // dockerHost2 is active and with reports
+ Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow();
+ assertEquals(Node.State.active, parentNode.state());
+ assertEquals(1, parentNode.reports().getReports().size());
+ assertFalse(parentNode.status().wantToRetire());
+
+ List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHost2).asList();
+ assertEquals(3, childNodes.size());
+ }
+
+ tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.failer.run();
+
+ {
+ // dockerHost2 is active with wantToRetire
+ Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow();
+ assertEquals(Node.State.active, parentNode.state());
+ assertFalse(parentNode.status().wantToRetire());
+ }
}
@Test