summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@oath.com> 2020-12-02 18:07:52 +0100
committer GitHub <noreply@github.com> 2020-12-02 18:07:52 +0100
commit 4d4c4fb9c9a3d34c0b419b260e1381a8fbb88181 (patch)
tree cdf4cffc179bc362fe357df3a1a6dfbd334ec207 /node-repository
parent 47da765f0f91d9f9622507770a1e04f2b361a010 (diff)
parent 3dacae51de884547a1ed0e9c78154789a88c0a83 (diff)
Merge pull request #15595 from vespa-engine/freva/grace-after-activate
Allow a grace period after node re-activation
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 4
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java 33
2 files changed, 35 insertions, 2 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 6d571fada9e..2999655e5fa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -153,7 +153,9 @@ public class NodeFailer extends NodeRepositoryMaintainer {
Map<Node, String> nodesByFailureReason = new HashMap<>();
for (Node node : activeNodes) {
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
- nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
+ // Allow a grace period after node re-activation
+ if ( ! node.history().hasEventAfter(History.Event.Type.activated, graceTimeEnd))
+ nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
}
else if (hostSuspended(node, activeNodes)) {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index d403affc292..d4dbc6f55a5 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -10,6 +10,7 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Report;
import com.yahoo.vespa.hosted.provision.node.Reports;
import org.junit.Test;
@@ -233,7 +234,7 @@ public class NodeFailerTest {
assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state());
-
+
String downHost1 = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
String downHost2 = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname();
tester.serviceMonitor.setHostDown(downHost1);
@@ -309,6 +310,36 @@ public class NodeFailerTest {
}
@Test
+ public void re_activate_grace_period_test() {
+ NodeFailTester tester = NodeFailTester.withTwoApplications();
+ String downNode = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
+
+ tester.serviceMonitor.setHostDown(downNode);
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+
+ tester.clock.advance(Duration.ofMinutes(75));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state());
+
+ // Re-activate the node. It is still down, but should not be failed out until the grace period has passed again
+ tester.nodeRepository.reactivate(downNode, Agent.system, getClass().getSimpleName());
+ tester.clock.advance(Duration.ofMinutes(30));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+
+ tester.clock.advance(Duration.ofMinutes(45));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state());
+ }
+
+ @Test
public void node_failing_can_allocate_spare() {
var resources = new NodeResources(1, 20, 15, 1);
Capacity capacity = Capacity.from(new ClusterResources(3, 1, resources), false, true);