diff options
Diffstat (limited to 'node-repository')
5 files changed, 65 insertions, 114 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index f16459ee8b9..95f97c8a8e6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -155,17 +155,13 @@ public class NodeFailer extends NodeRepositoryMaintainer { Map<Node, String> nodesByFailureReason = new HashMap<>(); for (Node node : nodeRepository().nodes().list(Node.State.ready)) { - if (expectConfigRequests(node) && ! hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) { - nodesByFailureReason.put(node, "Not receiving config requests from node"); - } else { - Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node); - List<String> failureReports = reasonsToFailParentHost(hostNode); - if (failureReports.size() > 0) { - if (hostNode.equals(node)) { - nodesByFailureReason.put(node, "Host has failure reports: " + failureReports); - } else { - nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports); - } + Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node); + List<String> failureReports = reasonsToFailParentHost(hostNode); + if (failureReports.size() > 0) { + if (hostNode.equals(node)) { + nodesByFailureReason.put(node, "Host has failure reports: " + failureReports); + } else { + nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports); } } } @@ -336,6 +332,9 @@ public class NodeFailer extends NodeRepositoryMaintainer { .matching(n -> n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow)); + log.info("node = " + node + ", recentlyFailedNodes.size() = " + recentlyFailedNodes.size() + + ", throttlePolicy.allowedToFailOf(" + allNodes.size() + ") = " + + throttlePolicy.allowedToFailOf(allNodes.size())); // Allow failing any node within policy if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(allNodes.size())) return false; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java index 774350084ac..ad8bfc1a37d 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java @@ -114,6 +114,34 @@ public class NodeFailTester { return tester; } + /** Create hostCount hosts, one app with containerCount containers, and one app with contentCount content nodes. */ + public static NodeFailTester withTwoApplications(int hostCount, int containerCount, int contentCount) { + NodeFailTester tester = new NodeFailTester(); + tester.createHostNodes(hostCount); + + // Create tenant host application + ClusterSpec clusterNodeAdminApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("node-admin")).vespaVersion("6.42").build(); + ClusterSpec clusterApp1 = ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.75.0").build(); + ClusterSpec clusterApp2 = ClusterSpec.request(ClusterSpec.Type.content, testCluster).vespaVersion("6.75.0").build(); + Capacity allHosts = Capacity.fromRequiredNodeType(NodeType.host); + Capacity capacity1 = Capacity.from(new ClusterResources(containerCount, 1, new NodeResources(1, 4, 10, 0.3)), false, true); + Capacity capacity2 = Capacity.from(new ClusterResources(contentCount, 1, new NodeResources(1, 4, 10, 0.3)), false, true); + tester.activate(tenantHostApp, clusterNodeAdminApp, allHosts); + tester.activate(app1, clusterApp1, capacity1); + tester.activate(app2, clusterApp2, capacity2); + assertEquals(Set.of(tester.nodeRepository.nodes().list().nodeType(NodeType.host).asList()), + Set.of(tester.nodeRepository.nodes().list(Node.State.active).owner(tenantHostApp).asList())); + assertEquals(capacity1.minResources().nodes(), tester.nodeRepository.nodes().list(Node.State.active).owner(app1).size()); + assertEquals(capacity2.minResources().nodes(), tester.nodeRepository.nodes().list(Node.State.active).owner(app2).size()); + + Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of( + tenantHostApp, new MockDeployer.ApplicationContext(tenantHostApp, clusterNodeAdminApp, allHosts), + app1, new MockDeployer.ApplicationContext(app1, clusterApp1, capacity1), + app2, new MockDeployer.ApplicationContext(app2, clusterApp2, capacity2)); + tester.initializeMaintainers(apps); + return tester; + } + public static NodeFailTester withTwoApplications(int numberOfHosts) { NodeFailTester tester = new NodeFailTester(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index dfeb82281e6..0f0da628d13 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -403,44 +403,6 @@ public class NodeFailerTest { } @Test - public void failing_ready_nodes() { - NodeFailTester tester = NodeFailTester.withTwoApplications(); - - // Add ready node - NodeResources newNodeResources = new NodeResources(3, 4, 5, 1); - tester.createReadyNodes(1, 16, newNodeResources); - - // For a day all nodes work so nothing happens - for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { - tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(); - tester.runMaintainers(); - assertEquals( 5, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); - } - - NodeList ready = tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant); - - // Two ready hosts and a ready node die, but only 2 of those are failed out - tester.clock.advance(Duration.ofMinutes(180)); - Node failingNode = ready.stream().filter(node -> node.resources().equals(newNodeResources)).findFirst().get(); - List<Node> otherNodes = ready.stream() - .filter(node -> !node.resources().equals(newNodeResources)) - .collect(Collectors.toList()); - tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), failingNode); - tester.runMaintainers(); - assertEquals( 3, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); - assertEquals( 2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - - // Another ready node dies and the node that died earlier, are allowed to fail - tester.clock.advance(Duration.ofDays(1)); - tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), failingNode, otherNodes.get(3)); - tester.runMaintainers(); - assertEquals( 1, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); - assertEquals(otherNodes.get(1), tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).first().get()); - assertEquals( 4, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - } - - @Test public void host_not_failed_without_config_requests() { NodeFailTester tester = NodeFailTester.withTwoApplications(); @@ -627,47 +589,11 @@ public class NodeFailerTest { public void node_failing_throttle() { // Throttles based on a absolute number in small zone { - // 50 regular tenant nodes, 10 hosts with each 3 tenant nodes, total 90 nodes + // 10 hosts with 3 tenant nodes each, total 40 nodes NodeFailTester tester = NodeFailTester.withTwoApplications(10); - List<Node> readyNodes = tester.createReadyNodes(50, 30); NodeList hosts = tester.nodeRepository.nodes().list().nodeType(NodeType.host); - List<Node> deadNodes = readyNodes.subList(0, 4); - - // 2 hours pass, 4 physical nodes die - for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) { - tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(deadNodes); - } - - // 2 nodes are failed (the minimum amount that are always allowed to fail) - tester.runMaintainers(); - assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); - assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); - - // 6 more hours pass, no more nodes are failed - for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) { - tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(deadNodes); - } - tester.runMaintainers(); - assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); - assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); - - // 18 more hours pass, the remaining dead nodes are allowed to fail - for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) { - tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(deadNodes); - } - tester.runMaintainers(); - assertEquals(4, tester.nodeRepository.nodes().list(Node.State.failed).size()); - - // 24 more hours pass, nothing happens - for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { - tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(deadNodes); - } + // TODO: Remove deadNodes + List<Node> deadNodes = new ArrayList<>(); // 3 hosts fail. 2 of them and all of their children are allowed to fail List<Node> failedHosts = hosts.asList().subList(0, 3); @@ -680,8 +606,7 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); tester.runMaintainers(); - assertEquals(4 + /* already failed */ - 2 + /* hosts */ + assertEquals(2 + /* hosts */ (2 * 3) /* containers per host */, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); @@ -693,14 +618,14 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.runMaintainers(); - assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(8, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric)); // The final host and its containers are failed out tester.clock.advance(Duration.ofMinutes(30)); tester.runMaintainers(); - assertEquals(16, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); @@ -708,43 +633,42 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofHours(25)); tester.allNodesMakeAConfigRequestExcept(deadNodes); tester.runMaintainers(); - assertEquals(16, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); } // Throttles based on percentage in large zone { - NodeFailTester tester = NodeFailTester.withNoApplications(); - List<Node> readyNodes = tester.createReadyNodes(500); - List<Node> deadNodes = readyNodes.subList(0, 15); + NodeFailTester tester = NodeFailTester.withTwoApplications(300, 100, 100); + NodeList allNodes = tester.nodeRepository.nodes().list(); + assertEquals(500, allNodes.size()); // 2 hours pass, 15 nodes (3%) die - for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) { - tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(deadNodes); - } tester.runMaintainers(); + allNodes.state(Node.State.active) + .nodeType(NodeType.tenant) + .stream() + .limit(15) + .forEach(host -> tester.serviceMonitor.setHostDown(host.hostname())); + tester.runMaintainers(); + tester.clock.advance(Duration.ofHours(2)); + tester.runMaintainers(); + // 2% are allowed to fail assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 6 more hours pass, no more nodes are failed - for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) { - tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(deadNodes); - } + tester.clock.advance(Duration.ofHours(6)); tester.runMaintainers(); assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 18 more hours pass, 24 hours since the first 10 nodes were failed. The remaining 5 are failed - for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) { - tester.clock.advance(Duration.ofMinutes(interval)); - tester.allNodesMakeAConfigRequestExcept(deadNodes); - } + tester.clock.advance(Duration.ofHours(18)); tester.runMaintainers(); assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java index 5c452989f56..adf181cadde 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java @@ -310,7 +310,7 @@ public class DynamicAllocationTest { tester.activate(application, hosts); NodeList activeNodes = tester.nodeRepository().nodes().list().owner(application); - assertEquals(Set.of("127.0.127.13", "::13"), activeNodes.asList().get(0).ipConfig().primary()); + assertEquals(Set.of("127.0.127.13", "::d"), activeNodes.asList().get(0).ipConfig().primary()); assertEquals(Set.of("127.0.127.2", "::2"), activeNodes.asList().get(1).ipConfig().primary()); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java index 21e9058cd7c..45cafc3e5cc 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java @@ -411,7 +411,7 @@ public class ProvisioningTester { // One test involves two provision testers - to detect this we check if the // name resolver already contains the next host - if this is the case - bump the indices and move on - String testIp = String.format("127.0.0.%d", nextIP); + String testIp = String.format("127.0.%d.%d", nextIP / 256, nextIP % 256); MockNameResolver nameResolver = (MockNameResolver)nodeRepository().nameResolver(); if (nameResolver.resolveHostname(testIp).isPresent()) { nextHost += 100; @@ -419,8 +419,8 @@ public class ProvisioningTester { } String hostname = nodeNamer.apply(nextHost); - String ipv4 = String.format("127.0.0.%d", nextIP); - String ipv6 = String.format("::%d", nextIP); + String ipv4 = String.format("127.0.%d.%d", nextIP / 256, nextIP % 256); + String ipv6 = String.format("::%x", nextIP); nameResolver.addRecord(hostname, ipv4, ipv6); HashSet<String> hostIps = new HashSet<>(); @@ -430,11 +430,11 @@ public class ProvisioningTester { Set<String> ipAddressPool = new LinkedHashSet<>(); for (int poolIp = 1; poolIp <= ipAddressPoolSize; poolIp++) { nextIP++; - String ipv6Addr = String.format("::%d", nextIP); + String ipv6Addr = String.format("::%x", nextIP); ipAddressPool.add(ipv6Addr); nameResolver.addRecord(String.format("node-%d-of-%s", poolIp, hostname), ipv6Addr); if (dualStack) { - String ipv4Addr = String.format("127.0.127.%d", nextIP); + String ipv4Addr = String.format("127.0.%d.%d", (127 + (nextIP / 256)), nextIP % 256); ipAddressPool.add(ipv4Addr); nameResolver.addRecord(String.format("node-%d-of-%s", poolIp, hostname), ipv4Addr); } |