Diffstat (limited to 'node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java')
-rw-r--r-- | node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java | 90 |
1 file changed, 77 insertions, 13 deletions
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 94090b38cb7..6100e87c5ec 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -7,6 +7,7 @@ import com.yahoo.config.provision.ClusterSpec;
 import com.yahoo.config.provision.NodeResources;
 import com.yahoo.config.provision.NodeType;
 import com.yahoo.slime.SlimeUtils;
+import com.yahoo.vespa.applicationmodel.HostName;
 import com.yahoo.vespa.applicationmodel.ServiceInstance;
 import com.yahoo.vespa.applicationmodel.ServiceStatus;
 import com.yahoo.vespa.hosted.provision.Node;
@@ -14,6 +15,7 @@ import com.yahoo.vespa.hosted.provision.NodeList;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
 import com.yahoo.vespa.hosted.provision.node.Agent;
 import com.yahoo.vespa.hosted.provision.node.Report;
+import com.yahoo.vespa.orchestrator.status.HostStatus;
 import org.junit.Test;
 
 import java.time.Duration;
@@ -170,8 +172,8 @@ public class NodeFailerTest {
 
         tester.clock.advance(Duration.ofMinutes(65));
         tester.runMaintainers();
-        assertTrue(tester.nodeRepository.nodes().node(host_from_normal_app).get().isDown());
-        assertTrue(tester.nodeRepository.nodes().node(host_from_suspended_app).get().isDown());
+        assertTrue(tester.nodeRepository.nodes().node(host_from_normal_app).get().history().isDown());
+        assertTrue(tester.nodeRepository.nodes().node(host_from_suspended_app).get().history().isDown());
         assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(host_from_normal_app).get().state());
         assertEquals(Node.State.active, tester.nodeRepository.nodes().node(host_from_suspended_app).get().state());
     }
@@ -203,8 +205,10 @@ public class NodeFailerTest {
         String downHost1 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
         String downHost2 = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app2).asList().get(3).hostname();
         // No liveness evidence yet:
-        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
-        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isSuspended());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isResumed());
 
         // For a day all nodes work so nothing happens
         for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
@@ -214,8 +218,10 @@
             assertEquals(0, tester.deployer.activations);
             assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
             assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
-            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
-            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp());
+            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
+            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
+            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isSuspended());
+            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isResumed());
         }
 
         tester.serviceMonitor.setHostDown(downHost1);
@@ -227,16 +233,16 @@
             assertEquals(0, tester.deployer.activations);
             assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
             assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
-            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isDown());
-            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isUp());
+            assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
+            assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
         }
         tester.serviceMonitor.setHostUp(downHost1);
 
         // downHost2 should now be failed and replaced, but not downHost1
         tester.clock.advance(Duration.ofDays(1));
         tester.runMaintainers();
-        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().isDown());
-        assertTrue(tester.nodeRepository.nodes().node(downHost1).get().isUp());
+        assertFalse(tester.nodeRepository.nodes().node(downHost1).get().history().isDown());
+        assertTrue(tester.nodeRepository.nodes().node(downHost1).get().history().isUp());
         assertEquals(1, tester.deployer.activations);
         assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
         assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
@@ -314,6 +320,64 @@
     }
 
     @Test
+    public void suspension_extends_grace_period() {
+        NodeFailTester tester = NodeFailTester.withTwoApplications();
+        String downNode = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
+
+        // host down, but within 1h timeout
+        tester.serviceMonitor.setHostDown(downNode);
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // 30m is still within 1h timeout
+        tester.clock.advance(Duration.ofMinutes(30));
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // suspend
+        tester.clock.advance(Duration.ofSeconds(5));
+        tester.nodeRepository.orchestrator().setNodeStatus(new HostName(downNode), HostStatus.ALLOWED_TO_BE_DOWN);
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // the timeout should now be 4h, so still ~3:30 left.
+        tester.clock.advance(Duration.ofHours(3));
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // advancing another hour takes us beyond the 4h timeout
+        tester.clock.advance(Duration.ofHours(1));
+        tester.runMaintainers();
+        assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state());
+    }
+
+    @Test
+    public void suspension_defers_downtime() {
+        NodeFailTester tester = NodeFailTester.withTwoApplications();
+        String downNode = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
+
+        // host suspends and goes down
+        tester.nodeRepository.orchestrator().setNodeStatus(new HostName(downNode), HostStatus.ALLOWED_TO_BE_DOWN);
+        tester.serviceMonitor.setHostDown(downNode);
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // host resumes after 30m
+        tester.clock.advance(Duration.ofMinutes(30));
+        tester.nodeRepository.orchestrator().setNodeStatus(new HostName(downNode), HostStatus.NO_REMARKS);
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+
+        // the host should fail 1h after resume, not 1h after it went down. Verify this
+        tester.clock.advance(Duration.ofMinutes(45));
+        tester.runMaintainers();
+        assertEquals(Node.State.active, tester.nodeRepository.nodes().node(downNode).get().state());
+        tester.clock.advance(Duration.ofMinutes(30));
+        tester.runMaintainers();
+        assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(downNode).get().state());
+    }
+
+    @Test
     public void node_failing_can_allocate_spare() {
         var resources = new NodeResources(1, 20, 15, 1);
         Capacity capacity = Capacity.from(new ClusterResources(3, 1, resources), false, true);
@@ -653,21 +717,21 @@
         tester.serviceMonitor.setHostDown(downHost);
         tester.runMaintainers();
         node = tester.nodeRepository.nodes().node(downHost).get();
-        assertTrue(node.isDown());
+        assertTrue(node.history().isDown());
         assertEquals(Node.State.active, node.state());
 
         // CMR still ongoing, don't fail yet
         clock.advance(Duration.ofHours(1));
         tester.runMaintainers();
         node = tester.nodeRepository.nodes().node(downHost).get();
-        assertTrue(node.isDown());
+        assertTrue(node.history().isDown());
         assertEquals(Node.State.active, node.state());
 
         // No ongoing CMR anymore, host should be failed
         clock.advance(Duration.ofHours(1));
         tester.runMaintainers();
         node = tester.nodeRepository.nodes().node(downHost).get();
-        assertTrue(node.isDown());
+        assertTrue(node.history().isDown());
         assertEquals(Node.State.failed, node.state());
     }
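For orientation, the semantics the two new tests pin down are: a node must be continuously down for about 1h before it is failed; while the node is suspended in the orchestrator (ALLOWED_TO_BE_DOWN) that window stretches to about 4h; and downtime accrued while suspended does not count, so the down clock effectively restarts at resume. The following is a minimal Java sketch of that decision rule only; the class name, method signature, arguments, and both timeout constants are illustrative assumptions, not the actual NodeFailer API.

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.Optional;

/**
 * Illustrative sketch of the grace-period rules the new tests exercise.
 * All names and constants here are assumptions, not Vespa's implementation.
 */
final class DownTimeoutPolicySketch {

    private static final Duration DOWN_TIMEOUT = Duration.ofHours(1);           // normal grace period
    private static final Duration SUSPENDED_DOWN_TIMEOUT = Duration.ofHours(4); // while suspended

    private final Clock clock;

    DownTimeoutPolicySketch(Clock clock) {
        this.clock = clock;
    }

    /**
     * A down node qualifies for failing only once it has been continuously down
     * past its grace period. Downtime accrued while suspended is deferred: the
     * down clock restarts at resume (suspension_defers_downtime), and a node
     * that is currently suspended gets the longer timeout
     * (suspension_extends_grace_period).
     */
    boolean shouldFail(Optional<Instant> downSince, boolean suspendedNow, Optional<Instant> resumedAt) {
        if (downSince.isEmpty()) return false; // node is up

        Instant start = downSince.get();
        // Ignore downtime that happened before the node resumed from suspension.
        if (resumedAt.isPresent() && resumedAt.get().isAfter(start))
            start = resumedAt.get();

        Duration timeout = suspendedNow ? SUSPENDED_DOWN_TIMEOUT : DOWN_TIMEOUT;
        return Duration.between(start, clock.instant()).compareTo(timeout) > 0;
    }
}

Traced against the tests above: in suspension_extends_grace_period the node suspends ~30m into its downtime, so at ~3h30m it is still within the 4h suspended timeout and fails only after ~4h30m; in suspension_defers_downtime the node resumes 30m in, so it stays active 45m after resume and fails 1h15m after resume. In this reading, restarting the down clock at resume rather than pausing and restarting it needs only the timestamps that the history().isDown()/isResumed() assertions in the diff can observe.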