diff options
author | Valerij Fredriksen <valerij92@gmail.com> | 2020-12-02 13:31:14 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerij92@gmail.com> | 2020-12-02 13:31:14 +0100 |
commit | 3dacae51de884547a1ed0e9c78154789a88c0a83 (patch) | |
tree | 259aa5977237e13348f8ce2931b5c9b2982c6033 /node-repository | |
parent | fdbcea7e4213e5e452729f133afe85939fe4271c (diff) |
Allow a grace period after node re-activation
Diffstat (limited to 'node-repository')
2 files changed, 35 insertions, 2 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 6d571fada9e..2999655e5fa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -153,7 +153,9 @@ public class NodeFailer extends NodeRepositoryMaintainer { Map<Node, String> nodesByFailureReason = new HashMap<>(); for (Node node : activeNodes) { if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) { - nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); + // Allow a grace period after node re-activation + if ( ! node.history().hasEventAfter(History.Event.Type.activated, graceTimeEnd)) + nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); } else if (hostSuspended(node, activeNodes)) { Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index d403affc292..d4dbc6f55a5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -10,6 +10,7 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.Report; import com.yahoo.vespa.hosted.provision.node.Reports; 
import org.junit.Test; @@ -233,7 +234,7 @@ public class NodeFailerTest { assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state()); assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state()); - + String downHost1 = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); String downHost2 = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname(); tester.serviceMonitor.setHostDown(downHost1); @@ -309,6 +310,36 @@ public class NodeFailerTest { } @Test + public void re_activate_grace_period_test() { + NodeFailTester tester = NodeFailTester.withTwoApplications(); + String downNode = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); + + tester.serviceMonitor.setHostDown(downNode); + tester.allNodesMakeAConfigRequestExcept(); + tester.runMaintainers(); + assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + + tester.clock.advance(Duration.ofMinutes(75)); + tester.allNodesMakeAConfigRequestExcept(); + tester.runMaintainers(); + assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state()); + + // Re-activate the node. It is still down, but should not be failed out until the grace period has passed again
+ tester.nodeRepository.reactivate(downNode, Agent.system, getClass().getSimpleName()); + tester.clock.advance(Duration.ofMinutes(30)); + tester.allNodesMakeAConfigRequestExcept(); + tester.runMaintainers(); + assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + + tester.clock.advance(Duration.ofMinutes(45)); + tester.allNodesMakeAConfigRequestExcept(); + tester.runMaintainers(); + assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state()); + } + + @Test + public void node_failing_can_allocate_spare() { var resources = new NodeResources(1, 20, 15, 1); Capacity capacity = Capacity.from(new ClusterResources(3, 1, resources), false, true); |