summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author valerijf <valerijf@oath.com> 2017-08-21 08:44:53 +0200
committer valerijf <valerijf@oath.com> 2017-08-21 08:44:53 +0200
commit e0ac6fb3d6e32dec0e986d79f789e45f62465dcd (patch)
tree 07c2a7dab24a0124fc7453d992d71b4dac2333ac /node-repository
parent d2b581843a0a4de6662e582d280c44d540c15029 (diff)
Fix HW fail count precedence
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java 4
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java 32
2 files changed, 18 insertions, 18 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 1271f21e06e..77b1dc712df 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -54,8 +54,6 @@ public class FailedExpirer extends Expirer {
protected void expire(List<Node> expired) {
List<Node> nodesToRecycle = new ArrayList<>();
for (Node recycleCandidate : expired) {
- if (failCountIndicatesHwFail(zone, recycleCandidate) && recycleCandidate.status().failCount() >= 5) continue;
-
if (recycleCandidate.status().hardwareFailureDescription().isPresent()) {
boolean shouldBeParked = recycleCandidate.type() != NodeType.host ||
nodeRepository.getChildNodes(recycleCandidate.hostname()).stream()
@@ -63,7 +61,7 @@ public class FailedExpirer extends Expirer {
if (shouldBeParked) nodeRepository.park(
recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure on node");
- } else {
+ } else if (! failCountIndicatesHwFail(zone, recycleCandidate) || recycleCandidate.status().failCount() < 5) {
nodesToRecycle.add(recycleCandidate);
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
index f44b78651b5..3ccacb3ff02 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
@@ -56,8 +56,7 @@ public class FailedExpirerTest {
failedExpirer.run();
assertNodeHostnames(Node.State.failed, "node1");
- assertNodeHostnames(Node.State.parked, "node2");
- assertNodeHostnames(Node.State.dirty, "node3");
+ assertNodeHostnames(Node.State.parked, "node2", "node3");
}
@Test
@@ -66,8 +65,8 @@ public class FailedExpirerTest {
clock.advance(Duration.ofDays(5));
failedExpirer.run();
- assertNodeHostnames(Node.State.parked, "node2");
- assertNodeHostnames(Node.State.dirty, "node1", "node3");
+ assertNodeHostnames(Node.State.parked, "node2", "node3");
+ assertNodeHostnames(Node.State.dirty, "node1");
}
@Test
@@ -76,8 +75,8 @@ public class FailedExpirerTest {
clock.advance(Duration.ofDays(5));
failedExpirer.run();
- assertNodeHostnames(Node.State.parked, "node2");
- assertNodeHostnames(Node.State.dirty, "node1", "node3");
+ assertNodeHostnames(Node.State.parked, "node2", "node3");
+ assertNodeHostnames(Node.State.dirty, "node1");
}
@Test
@@ -86,12 +85,12 @@ public class FailedExpirerTest {
clock.advance(Duration.ofDays(5));
failedExpirer.run();
- assertNodeHostnames(Node.State.parked, "node2");
- assertNodeHostnames(Node.State.dirty, "node1", "node3");
+ assertNodeHostnames(Node.State.parked, "node2", "node3");
+ assertNodeHostnames(Node.State.dirty, "node1");
}
@Test
- public void ensure_failed_docker_host_is_parked() throws InterruptedException {
+ public void ensure_parked_docker_host() throws InterruptedException {
failureScenarioIn(SystemName.main, Environment.prod, "docker");
failNode("parent2");
@@ -101,12 +100,12 @@ public class FailedExpirerTest {
failedExpirer.run(); // Run twice because parent can only be parked after the child
failedExpirer.run();
- assertNodeHostnames(Node.State.parked, "parent2", "node2");
+ assertNodeHostnames(Node.State.parked, "parent2", "node2", "node3");
}
@Test
public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() throws InterruptedException {
- failureScenarioIn(SystemName.main, Environment.prod, "docker");
+ failureScenarioIn(SystemName.cd, Environment.prod, "docker");
failNode("parent1");
setHWFailureForNode("parent1");
@@ -163,15 +162,18 @@ public class FailedExpirerTest {
// Set node1 to have failed 4 times before
Node node1 = nodeRepository.getNode("node1").get();
- node1 = node1.with(node1.status().withIncreasedFailCount());
- node1 = node1.with(node1.status().withIncreasedFailCount());
- node1 = node1.with(node1.status().withIncreasedFailCount());
- node1 = node1.with(node1.status().withIncreasedFailCount());
+ node1 = node1.with(node1.status().setFailCount(4));
nodeRepository.write(node1);
// Set node2 to have a detected hardware failure
setHWFailureForNode("node2");
+ // Set node3 to have failed 8 times before and have a HW failure
+ Node node3 = nodeRepository.getNode("node3").get();
+ node3 = node1.with(node3.status().setFailCount(8));
+ nodeRepository.write(node3);
+ setHWFailureForNode("node3");
+
// Allocate the nodes
List<Node> provisioned = nodeRepository.getNodes(NodeType.tenant, Node.State.provisioned);
nodeRepository.setReady(nodeRepository.setDirty(provisioned));