diff options
Diffstat (limited to 'node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java')
-rw-r--r-- | node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java | 288 |
1 files changed, 144 insertions, 144 deletions
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index d4dbc6f55a5..50b99afbca5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -51,11 +51,11 @@ public class NodeFailerTest { String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2); // Set failure report to the parent and all its children. - tester.nodeRepository.getNodes().stream() + tester.nodeRepository.nodes().getNodes().stream() .filter(node -> node.hostname().equals(hostWithFailureReports)) .forEach(node -> { Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport)); - tester.nodeRepository.write(updatedNode, () -> {}); + tester.nodeRepository.nodes().write(updatedNode, () -> {}); }); testNodeFailingWith(tester, hostWithFailureReports); @@ -63,7 +63,7 @@ public class NodeFailerTest { private void testNodeFailingWith(NodeFailTester tester, String hostWithHwFailure) { // The host should have 2 nodes in active and 1 ready - Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream() + Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.nodes().list().childrenOf(hostWithHwFailure).asList().stream() .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList()))); assertEquals(2, hostnamesByState.get(Node.State.active).size()); assertEquals(1, hostnamesByState.get(Node.State.ready).size()); @@ -80,7 +80,7 @@ public class NodeFailerTest { Map<Node.State, List<String>> expectedHostnamesByState1Iter = Map.of( Node.State.failed, List.of(hostnamesByState.get(Node.State.ready).get(0), hostnamesByState.get(Node.State.active).get(0)), Node.State.active, hostnamesByState.get(Node.State.active).subList(1, 2)); - Map<Node.State, List<String>> hostnamesByState1Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream() + Map<Node.State, List<String>> hostnamesByState1Iter = tester.nodeRepository.nodes().list().childrenOf(hostWithHwFailure).asList().stream() .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList()))); assertEquals(expectedHostnamesByState1Iter, hostnamesByState1Iter); @@ -92,26 +92,26 @@ public class NodeFailerTest { tester.runMaintainers(); // All of the children should be failed now - Set<Node.State> childStates2Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream() + Set<Node.State> childStates2Iter = tester.nodeRepository.nodes().list().childrenOf(hostWithHwFailure).asList().stream() .map(Node::state).collect(Collectors.toSet()); assertEquals(Set.of(Node.State.failed), childStates2Iter); // The host itself is still active as it too must be allowed to suspend - assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithHwFailure).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(hostWithHwFailure).get().state()); tester.suspend(hostWithHwFailure); tester.runMaintainers(); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(hostWithHwFailure).get().state()); - assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(hostWithHwFailure).get().state()); + assertEquals(4, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); } @Test public void hw_fail_only_if_whole_host_is_suspended() { NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6); String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2); - assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(hostWithFailureReports).get().state()); // The host has 2 nodes in active and 1 ready - Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.list().childrenOf(hostWithFailureReports).asList().stream() + Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.nodes().list().childrenOf(hostWithFailureReports).asList().stream() .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList()))); assertEquals(2, hostnamesByState.get(Node.State.active).size()); String activeChild1 = hostnamesByState.get(Node.State.active).get(0); @@ -121,50 +121,50 @@ public class NodeFailerTest { // Set failure report to the parent and all its children. Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", HARD_FAIL, Instant.now(), "too low"); - tester.nodeRepository.getNodes().stream() + tester.nodeRepository.nodes().getNodes().stream() .filter(node -> node.hostname().equals(hostWithFailureReports)) .forEach(node -> { Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport)); - tester.nodeRepository.write(updatedNode, () -> {}); + tester.nodeRepository.nodes().write(updatedNode, () -> {}); }); // The ready node will be failed, but neither the host nor the 2 active nodes since they have not been suspended tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state()); - assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); - assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state()); - assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild2).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(readyChild).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(hostWithFailureReports).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(activeChild1).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(activeChild2).get().state()); // Suspending the host will not fail any more since none of the children are suspened tester.suspend(hostWithFailureReports); tester.clock.advance(Duration.ofHours(25)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state()); - assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); - assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state()); - assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild2).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(readyChild).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(hostWithFailureReports).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(activeChild1).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(activeChild2).get().state()); // Suspending one child node will fail that out. tester.suspend(activeChild1); tester.clock.advance(Duration.ofHours(25)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state()); - assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state()); - assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild2).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(readyChild).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(hostWithFailureReports).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(activeChild1).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(activeChild2).get().state()); // Suspending the second child node will fail that out and the host. tester.suspend(activeChild2); tester.clock.advance(Duration.ofHours(25)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state()); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state()); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild2).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(readyChild).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(hostWithFailureReports).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(activeChild1).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(activeChild2).get().state()); } @Test @@ -173,39 +173,39 @@ public class NodeFailerTest { tester.suspend(NodeFailTester.app1); // Set two nodes down (one for each application) and wait 65 minutes - String host_from_suspended_app = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); - String host_from_normal_app = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname(); + String host_from_suspended_app = tester.nodeRepository.nodes().getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); + String host_from_normal_app = tester.nodeRepository.nodes().getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname(); tester.serviceMonitor.setHostDown(host_from_suspended_app); tester.serviceMonitor.setHostDown(host_from_normal_app); tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(65)); tester.runMaintainers(); - assertTrue(tester.nodeRepository.getNode(host_from_normal_app).get().isDown()); - assertTrue(tester.nodeRepository.getNode(host_from_suspended_app).get().isDown()); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(host_from_normal_app).get().state()); - assertEquals(Node.State.active, tester.nodeRepository.getNode(host_from_suspended_app).get().state()); + assertTrue(tester.nodeRepository.nodes().getNode(host_from_normal_app).get().isDown()); + assertTrue(tester.nodeRepository.nodes().getNode(host_from_suspended_app).get().isDown()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(host_from_normal_app).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.nodes().getNode(host_from_suspended_app).get().state()); } @Test public void zone_is_not_working_if_too_many_nodes_down() { NodeFailTester tester = NodeFailTester.withTwoApplications(); - tester.serviceMonitor.setHostDown(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(0).hostname()); + tester.serviceMonitor.setHostDown(tester.nodeRepository.nodes().getNodes(NodeFailTester.app1, Node.State.active).get(0).hostname()); tester.runMaintainers(); - assertTrue(tester.nodeRepository.isWorking()); + assertTrue(tester.nodeRepository.nodes().isWorking()); - tester.serviceMonitor.setHostDown(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname()); + tester.serviceMonitor.setHostDown(tester.nodeRepository.nodes().getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname()); tester.runMaintainers(); - assertTrue(tester.nodeRepository.isWorking()); + assertTrue(tester.nodeRepository.nodes().isWorking()); - tester.serviceMonitor.setHostDown(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(2).hostname()); + tester.serviceMonitor.setHostDown(tester.nodeRepository.nodes().getNodes(NodeFailTester.app1, Node.State.active).get(2).hostname()); tester.runMaintainers(); - assertFalse(tester.nodeRepository.isWorking()); + assertFalse(tester.nodeRepository.nodes().isWorking()); tester.clock.advance(Duration.ofMinutes(65)); tester.runMaintainers(); - assertTrue("Node failing is deactivated", tester.nodeRepository.list(Node.State.failed).isEmpty()); + assertTrue("Node failing is deactivated", tester.nodeRepository.nodes().list(Node.State.failed).isEmpty()); } @Test @@ -219,24 +219,24 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(); assertEquals( 0, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals( 0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(12, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals( 0, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals( 4, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); } // Hardware failures are detected on two ready nodes, which are then failed - Node readyFail1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(2); - Node readyFail2 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(3); - tester.nodeRepository.write(readyFail1.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); - tester.nodeRepository.write(readyFail2.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); - assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - tester.runMaintainers(); - assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state()); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state()); - - String downHost1 = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); - String downHost2 = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname(); + Node readyFail1 = tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).get(2); + Node readyFail2 = tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).get(3); + tester.nodeRepository.nodes().write(readyFail1.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); + tester.nodeRepository.nodes().write(readyFail2.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); + assertEquals(4, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + tester.runMaintainers(); + assertEquals(2, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(readyFail1.hostname()).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(readyFail2.hostname()).get().state()); + + String downHost1 = tester.nodeRepository.nodes().getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); + String downHost2 = tester.nodeRepository.nodes().getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname(); tester.serviceMonitor.setHostDown(downHost1); tester.serviceMonitor.setHostDown(downHost2); // nothing happens the first 45 minutes @@ -245,9 +245,9 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); assertEquals( 0, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(12, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals( 2, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals( 2, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); } tester.serviceMonitor.setHostUp(downHost1); @@ -256,10 +256,10 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals( 1, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(downHost2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname()); + assertEquals(12, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals( 3, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals( 1, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(downHost2, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).get(0).hostname()); // downHost1 fails again tester.serviceMonitor.setHostDown(downHost1); @@ -275,12 +275,12 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals( 2, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals( 0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(12, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals( 4, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals( 0, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); // the last host goes down - Node lastNode = tester.highestIndex(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active)); + Node lastNode = tester.highestIndex(tester.nodeRepository.nodes().getNodes(NodeFailTester.app1, Node.State.active)); tester.serviceMonitor.setHostDown(lastNode.hostname()); // it is not failed because there are no ready nodes to replace it for (int minutes = 0; minutes < 75; minutes +=5 ) { @@ -288,9 +288,9 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); assertEquals( 2, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals( 0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(12, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals( 4, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals( 0, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); } // A new node is available @@ -300,11 +300,11 @@ public class NodeFailerTest { tester.runMaintainers(); // The node is now failed assertEquals( 3, tester.deployer.redeployments); - assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals( 0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(12, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals( 5, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals( 0, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); assertTrue("The index of the last failed node is not reused", - tester.highestIndex(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active)).allocation().get().membership().index() + tester.highestIndex(tester.nodeRepository.nodes().getNodes(NodeFailTester.app1, Node.State.active)).allocation().get().membership().index() > lastNode.allocation().get().membership().index()); } @@ -312,31 +312,31 @@ public class NodeFailerTest { @Test public void re_activate_grace_period_test() { NodeFailTester tester = NodeFailTester.withTwoApplications(); - String downNode = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); + String downNode = tester.nodeRepository.nodes().getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname(); tester.serviceMonitor.setHostDown(downNode); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(0, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); tester.clock.advance(Duration.ofMinutes(75)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state()); + assertEquals(1, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(downNode).get().state()); // Re-activate the node. It is still down, but should not be failed out until the grace period has passed again - tester.nodeRepository.reactivate(downNode, Agent.system, getClass().getSimpleName()); + tester.nodeRepository.nodes().reactivate(downNode, Agent.system, getClass().getSimpleName()); tester.clock.advance(Duration.ofMinutes(30)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(0, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); tester.clock.advance(Duration.ofMinutes(45)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state()); + assertEquals(1, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(Node.State.failed, tester.nodeRepository.nodes().getNode(downNode).get().state()); } @Test @@ -349,7 +349,7 @@ public class NodeFailerTest { ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("test")).vespaVersion("6.42").build(); tester.activate(NodeFailTester.app1, cluster, capacity); - String downHost = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(0).hostname(); + String downHost = tester.nodeRepository.nodes().getNodes(NodeFailTester.app1, Node.State.active).get(0).hostname(); tester.serviceMonitor.setHostDown(downHost); // nothing happens the first 45 minutes @@ -358,8 +358,8 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); assertEquals(0, tester.deployer.redeployments); - assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(3, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(0, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); } // downHost should now be failed and replaced @@ -367,9 +367,9 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); assertEquals(1, tester.deployer.redeployments); - assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(downHost, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname()); + assertEquals(1, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(3, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(downHost, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).get(0).hostname()); } @Test @@ -385,10 +385,10 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals( 5, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); } - List<Node> ready = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready); + List<Node> ready = tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready); // Two ready nodes and a ready docker node die, but only 2 of those are failed out tester.clock.advance(Duration.ofMinutes(180)); @@ -398,16 +398,16 @@ public class NodeFailerTest { .collect(Collectors.toList()); tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode); tester.runMaintainers(); - assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals( 3, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals( 2, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); // Another ready node dies and the node that died earlier, are allowed to fail tester.clock.advance(Duration.ofDays(1)); tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode, otherNodes.get(3)); tester.runMaintainers(); - assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(otherNodes.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0)); - assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals( 1, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(otherNodes.get(1), tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).get(0)); + assertEquals( 4, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); } @Test @@ -419,17 +419,17 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals( 3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size()); - assertEquals( 0, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size()); + assertEquals( 3, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.ready).size()); + assertEquals( 0, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.failed).size()); } // Two ready nodes and a ready docker node die, but only 2 of those are failed out tester.clock.advance(Duration.ofMinutes(180)); - Node dockerHost = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).iterator().next(); + Node dockerHost = tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.ready).iterator().next(); tester.allNodesMakeAConfigRequestExcept(dockerHost); tester.runMaintainers(); - assertEquals( 3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size()); - assertEquals( 0, tester.nodeRepository.getNodes(NodeType.host, Node.State.failed).size()); + assertEquals( 3, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.ready).size()); + assertEquals( 0, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.failed).size()); } @Test @@ -441,9 +441,9 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); + assertEquals(8, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(13, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(7, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.active).size()); } @@ -457,9 +457,9 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); assertEquals(0, tester.deployer.redeployments); - assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); + assertEquals(8, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(13, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(7, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.active).size()); } tester.clock.advance(Duration.ofMinutes(30)); @@ -467,14 +467,14 @@ public class NodeFailerTest { tester.runMaintainers(); assertEquals(2 + 1, tester.deployer.redeployments); - assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(10, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); + assertEquals(3, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(8, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(10, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(6, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.active).size()); // Now lets fail an active tenant node - Node downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0); + Node downTenant1 = tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).get(0); tester.serviceMonitor.setHostDown(downTenant1.hostname()); // nothing happens during the entire day because of the failure throttling @@ -482,7 +482,7 @@ public class NodeFailerTest { tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); - assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(3 + 1, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); } tester.clock.advance(Duration.ofMinutes(30)); @@ -490,10 +490,10 @@ public class NodeFailerTest { tester.runMaintainers(); assertEquals(3 + 1, tester.deployer.redeployments); - assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); + assertEquals(4, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(8, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(9, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(6, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.active).size()); // Lets fail another host, make sure it is not the same where downTenant1 is a child @@ -505,10 +505,10 @@ public class NodeFailerTest { tester.runMaintainers(); assertEquals(5 + 2, tester.deployer.redeployments); - assertEquals(7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(6, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); + assertEquals(7, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(8, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(6, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(5, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.active).size()); // We have only 5 hosts remaining, so if we fail another host, we should only be able to redeploy app1's // node, while app2's should remain @@ -520,10 +520,10 @@ public class NodeFailerTest { tester.runMaintainers(); assertEquals(6 + 2, tester.deployer.redeployments); - assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); + assertEquals(9, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(8, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(4, tester.nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(5, tester.nodeRepository.nodes().getNodes(NodeType.host, Node.State.active).size()); } @Test @@ -545,7 +545,7 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); - assertEquals(count, tester.nodeRepository.getNodes(nodeType, Node.State.active).size()); + assertEquals(count, tester.nodeRepository.nodes().getNodes(nodeType, Node.State.active).size()); } Set<String> downHosts = Set.of("host2", "host3"); @@ -558,7 +558,7 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); assertEquals( 0, tester.deployer.redeployments); - assertEquals(count, tester.nodeRepository.getNodes(nodeType, Node.State.active).size()); + assertEquals(count, tester.nodeRepository.nodes().getNodes(nodeType, Node.State.active).size()); } tester.clock.advance(Duration.ofMinutes(60)); @@ -566,15 +566,15 @@ public class NodeFailerTest { // one down host should now be failed, but not two as we are only allowed to fail one proxy assertEquals(expectedFailCount, tester.deployer.redeployments); - assertEquals(count - expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.active).size()); - assertEquals(expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.failed).size()); - tester.nodeRepository.getNodes(nodeType, Node.State.failed) + assertEquals(count - expectedFailCount, tester.nodeRepository.nodes().getNodes(nodeType, Node.State.active).size()); + assertEquals(expectedFailCount, tester.nodeRepository.nodes().getNodes(nodeType, Node.State.failed).size()); + tester.nodeRepository.nodes().getNodes(nodeType, Node.State.failed) .forEach(node -> assertTrue(downHosts.contains(node.hostname()))); // trying to fail again will still not fail the other down host tester.clock.advance(Duration.ofMinutes(60)); tester.runMaintainers(); - assertEquals(count - expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.active).size()); + assertEquals(count - expectedFailCount, tester.nodeRepository.nodes().getNodes(nodeType, Node.State.active).size()); } @Test @@ -586,10 +586,10 @@ public class NodeFailerTest { tester.runMaintainers(); assertEquals(Node.State.ready, readyNode.state()); - tester.nodeRepository.write(readyNode.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); + tester.nodeRepository.nodes().write(readyNode.with(new Reports().withReport(badTotalMemorySizeReport)), () -> {}); tester.runMaintainers(); - assertEquals(1, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(1, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); } @Test @@ -599,7 +599,7 @@ public class NodeFailerTest { // 50 regular tenant nodes, 10 hosts with each 3 tenant nodes, total 90 nodes NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(10); List<Node> readyNodes = tester.createReadyNodes(50, 30); - List<Node> hosts = tester.nodeRepository.getNodes(NodeType.host); + List<Node> hosts = tester.nodeRepository.nodes().getNodes(NodeType.host); List<Node> deadNodes = readyNodes.subList(0, 4); // 2 hours pass, 4 physical nodes die @@ -610,7 +610,7 @@ public class NodeFailerTest { // 2 nodes are failed (the minimum amount that are always allowed to fail) tester.runMaintainers(); - assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(2, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); @@ -620,7 +620,7 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.runMaintainers(); - assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(2, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); @@ -630,7 +630,7 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.runMaintainers(); - assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(4, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); // 24 more hours pass, nothing happens for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { @@ -652,7 +652,7 @@ public class NodeFailerTest { assertEquals(4 + /* already failed */ 2 + /* hosts */ (2 * 3) /* containers per host */, - tester.nodeRepository.getNodes(Node.State.failed).size()); + tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric)); @@ -662,14 +662,14 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.runMaintainers(); - assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(12, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric)); // The final host and its containers are failed out tester.clock.advance(Duration.ofMinutes(30)); tester.runMaintainers(); - assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(16, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); @@ -677,7 +677,7 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofHours(25)); tester.allNodesMakeAConfigRequestExcept(deadNodes); tester.runMaintainers(); - assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(16, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); } @@ -695,7 +695,7 @@ public class NodeFailerTest { } tester.runMaintainers(); // 2% are allowed to fail - assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(10, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); @@ -705,7 +705,7 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.runMaintainers(); - assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(10, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); @@ -715,7 +715,7 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.runMaintainers(); - assertEquals(15, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(15, tester.nodeRepository.nodes().getNodes(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); } @@ -758,7 +758,7 @@ public class NodeFailerTest { */ private static String selectFirstParentHostWithNActiveNodesExcept(NodeRepository nodeRepository, int n, String... except) { Set<String> exceptSet = Arrays.stream(except).collect(Collectors.toSet()); - return nodeRepository.getNodes(NodeType.tenant, Node.State.active).stream() + return nodeRepository.nodes().getNodes(NodeType.tenant, Node.State.active).stream() .collect(Collectors.groupingBy(Node::parentHostname)) .entrySet().stream() .filter(entry -> entry.getValue().size() == n) |