diff options
author | valerijf <valerijf@oath.com> | 2017-08-21 08:44:53 +0200 |
---|---|---|
committer | valerijf <valerijf@oath.com> | 2017-08-21 08:44:53 +0200 |
commit | e0ac6fb3d6e32dec0e986d79f789e45f62465dcd (patch) | |
tree | 07c2a7dab24a0124fc7453d992d71b4dac2333ac /node-repository/src | |
parent | d2b581843a0a4de6662e582d280c44d540c15029 (diff) |
Fix HW fail count precedence
Diffstat (limited to 'node-repository/src')
2 files changed, 18 insertions, 18 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index 1271f21e06e..77b1dc712df 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -54,8 +54,6 @@ public class FailedExpirer extends Expirer { protected void expire(List<Node> expired) { List<Node> nodesToRecycle = new ArrayList<>(); for (Node recycleCandidate : expired) { - if (failCountIndicatesHwFail(zone, recycleCandidate) && recycleCandidate.status().failCount() >= 5) continue; - if (recycleCandidate.status().hardwareFailureDescription().isPresent()) { boolean shouldBeParked = recycleCandidate.type() != NodeType.host || nodeRepository.getChildNodes(recycleCandidate.hostname()).stream() @@ -63,7 +61,7 @@ public class FailedExpirer extends Expirer { if (shouldBeParked) nodeRepository.park( recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure on node"); - } else { + } else if (! failCountIndicatesHwFail(zone, recycleCandidate) || recycleCandidate.status().failCount() < 5) { nodesToRecycle.add(recycleCandidate); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java index f44b78651b5..3ccacb3ff02 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java @@ -56,8 +56,7 @@ public class FailedExpirerTest { failedExpirer.run(); assertNodeHostnames(Node.State.failed, "node1"); - assertNodeHostnames(Node.State.parked, "node2"); - assertNodeHostnames(Node.State.dirty, "node3"); + assertNodeHostnames(Node.State.parked, "node2", "node3"); } @Test @@ -66,8 +65,8 @@ public class FailedExpirerTest { clock.advance(Duration.ofDays(5)); failedExpirer.run(); - assertNodeHostnames(Node.State.parked, "node2"); - assertNodeHostnames(Node.State.dirty, "node1", "node3"); + assertNodeHostnames(Node.State.parked, "node2", "node3"); + assertNodeHostnames(Node.State.dirty, "node1"); } @Test @@ -76,8 +75,8 @@ public class FailedExpirerTest { clock.advance(Duration.ofDays(5)); failedExpirer.run(); - assertNodeHostnames(Node.State.parked, "node2"); - assertNodeHostnames(Node.State.dirty, "node1", "node3"); + assertNodeHostnames(Node.State.parked, "node2", "node3"); + assertNodeHostnames(Node.State.dirty, "node1"); } @Test @@ -86,12 +85,12 @@ public class FailedExpirerTest { clock.advance(Duration.ofDays(5)); failedExpirer.run(); - assertNodeHostnames(Node.State.parked, "node2"); - assertNodeHostnames(Node.State.dirty, "node1", "node3"); + assertNodeHostnames(Node.State.parked, "node2", "node3"); + assertNodeHostnames(Node.State.dirty, "node1"); } @Test - public void ensure_failed_docker_host_is_parked() throws InterruptedException { + public void ensure_parked_docker_host() throws InterruptedException { failureScenarioIn(SystemName.main, Environment.prod, "docker"); failNode("parent2"); @@ -101,12 +100,12 @@ public class FailedExpirerTest { failedExpirer.run(); // Run twice because parent can only be parked after the child failedExpirer.run(); - assertNodeHostnames(Node.State.parked, "parent2", "node2"); + assertNodeHostnames(Node.State.parked, "parent2", "node2", "node3"); } @Test public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() throws InterruptedException { - failureScenarioIn(SystemName.main, Environment.prod, "docker"); + failureScenarioIn(SystemName.cd, Environment.prod, "docker"); failNode("parent1"); setHWFailureForNode("parent1"); @@ -163,15 +162,18 @@ public class FailedExpirerTest { // Set node1 to have failed 4 times before Node node1 = nodeRepository.getNode("node1").get(); - node1 = node1.with(node1.status().withIncreasedFailCount()); - node1 = node1.with(node1.status().withIncreasedFailCount()); - node1 = node1.with(node1.status().withIncreasedFailCount()); - node1 = node1.with(node1.status().withIncreasedFailCount()); + node1 = node1.with(node1.status().setFailCount(4)); nodeRepository.write(node1); // Set node2 to have a detected hardware failure setHWFailureForNode("node2"); + // Set node3 to have failed 8 times before and have a HW failure + Node node3 = nodeRepository.getNode("node3").get(); + node3 = node1.with(node3.status().setFailCount(8)); + nodeRepository.write(node3); + setHWFailureForNode("node3"); + // Allocate the nodes List<Node> provisioned = nodeRepository.getNodes(NodeType.tenant, Node.State.provisioned); nodeRepository.setReady(nodeRepository.setDirty(provisioned)); |