summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@verizonmedia.com>2021-08-16 18:10:24 +0200
committerGitHub <noreply@github.com>2021-08-16 18:10:24 +0200
commit249377274b11826daffe0ae8c2bf3424e758dadb (patch)
treea7c0cc69935c7a05aafc2254ac2f36e19a983461
parent3f38899f1fe9a9576fbc027ccd282d4e177e756c (diff)
parent9faa27eadc810013d55c31f7766a35c292f9dc52 (diff)
Merge pull request #18758 from vespa-engine/hakonhall/do-not-fail-ready-nodes-wo-recent-config-requests
Do not fail ready nodes w/o recent config requests
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java18
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java28
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java120
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java10
5 files changed, 60 insertions, 118 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index f16459ee8b9..3c58db5aee3 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -155,17 +155,13 @@ public class NodeFailer extends NodeRepositoryMaintainer {
Map<Node, String> nodesByFailureReason = new HashMap<>();
for (Node node : nodeRepository().nodes().list(Node.State.ready)) {
- if (expectConfigRequests(node) && ! hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) {
- nodesByFailureReason.put(node, "Not receiving config requests from node");
- } else {
- Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
- List<String> failureReports = reasonsToFailParentHost(hostNode);
- if (failureReports.size() > 0) {
- if (hostNode.equals(node)) {
- nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
- } else {
- nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports);
- }
+ Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
+ List<String> failureReports = reasonsToFailParentHost(hostNode);
+ if (failureReports.size() > 0) {
+ if (hostNode.equals(node)) {
+ nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
+ } else {
+ nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports);
}
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 774350084ac..ad8bfc1a37d 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -114,6 +114,34 @@ public class NodeFailTester {
return tester;
}
+ /** Create hostCount hosts, one app with containerCount containers, and one app with contentCount content nodes. */
+ public static NodeFailTester withTwoApplications(int hostCount, int containerCount, int contentCount) {
+ NodeFailTester tester = new NodeFailTester();
+ tester.createHostNodes(hostCount);
+
+ // Create tenant host application
+ ClusterSpec clusterNodeAdminApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("node-admin")).vespaVersion("6.42").build();
+ ClusterSpec clusterApp1 = ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.75.0").build();
+ ClusterSpec clusterApp2 = ClusterSpec.request(ClusterSpec.Type.content, testCluster).vespaVersion("6.75.0").build();
+ Capacity allHosts = Capacity.fromRequiredNodeType(NodeType.host);
+ Capacity capacity1 = Capacity.from(new ClusterResources(containerCount, 1, new NodeResources(1, 4, 10, 0.3)), false, true);
+ Capacity capacity2 = Capacity.from(new ClusterResources(contentCount, 1, new NodeResources(1, 4, 10, 0.3)), false, true);
+ tester.activate(tenantHostApp, clusterNodeAdminApp, allHosts);
+ tester.activate(app1, clusterApp1, capacity1);
+ tester.activate(app2, clusterApp2, capacity2);
+ assertEquals(Set.of(tester.nodeRepository.nodes().list().nodeType(NodeType.host).asList()),
+ Set.of(tester.nodeRepository.nodes().list(Node.State.active).owner(tenantHostApp).asList()));
+ assertEquals(capacity1.minResources().nodes(), tester.nodeRepository.nodes().list(Node.State.active).owner(app1).size());
+ assertEquals(capacity2.minResources().nodes(), tester.nodeRepository.nodes().list(Node.State.active).owner(app2).size());
+
+ Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(
+ tenantHostApp, new MockDeployer.ApplicationContext(tenantHostApp, clusterNodeAdminApp, allHosts),
+ app1, new MockDeployer.ApplicationContext(app1, clusterApp1, capacity1),
+ app2, new MockDeployer.ApplicationContext(app2, clusterApp2, capacity2));
+ tester.initializeMaintainers(apps);
+ return tester;
+ }
+
public static NodeFailTester withTwoApplications(int numberOfHosts) {
NodeFailTester tester = new NodeFailTester();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index dfeb82281e6..2bd0c91f4a1 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -403,44 +403,6 @@ public class NodeFailerTest {
}
@Test
- public void failing_ready_nodes() {
- NodeFailTester tester = NodeFailTester.withTwoApplications();
-
- // Add ready node
- NodeResources newNodeResources = new NodeResources(3, 4, 5, 1);
- tester.createReadyNodes(1, 16, newNodeResources);
-
- // For a day all nodes work so nothing happens
- for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
- tester.clock.advance(Duration.ofMinutes(interval));
- tester.allNodesMakeAConfigRequestExcept();
- tester.runMaintainers();
- assertEquals( 5, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
- }
-
- NodeList ready = tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant);
-
- // Two ready hosts and a ready node die, but only 2 of those are failed out
- tester.clock.advance(Duration.ofMinutes(180));
- Node failingNode = ready.stream().filter(node -> node.resources().equals(newNodeResources)).findFirst().get();
- List<Node> otherNodes = ready.stream()
- .filter(node -> !node.resources().equals(newNodeResources))
- .collect(Collectors.toList());
- tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), failingNode);
- tester.runMaintainers();
- assertEquals( 3, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
- assertEquals( 2, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
-
- // Another ready node dies and the node that died earlier, are allowed to fail
- tester.clock.advance(Duration.ofDays(1));
- tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), failingNode, otherNodes.get(3));
- tester.runMaintainers();
- assertEquals( 1, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
- assertEquals(otherNodes.get(1), tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).first().get());
- assertEquals( 4, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
- }
-
- @Test
public void host_not_failed_without_config_requests() {
NodeFailTester tester = NodeFailTester.withTwoApplications();
@@ -627,61 +589,20 @@ public class NodeFailerTest {
public void node_failing_throttle() {
// Throttles based on a absolute number in small zone
{
- // 50 regular tenant nodes, 10 hosts with each 3 tenant nodes, total 90 nodes
+ // 10 hosts with 3 tenant nodes each, total 40 nodes
NodeFailTester tester = NodeFailTester.withTwoApplications(10);
- List<Node> readyNodes = tester.createReadyNodes(50, 30);
NodeList hosts = tester.nodeRepository.nodes().list().nodeType(NodeType.host);
- List<Node> deadNodes = readyNodes.subList(0, 4);
-
- // 2 hours pass, 4 physical nodes die
- for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) {
- tester.clock.advance(Duration.ofMinutes(interval));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
- }
-
- // 2 nodes are failed (the minimum amount that are always allowed to fail)
- tester.runMaintainers();
- assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
- assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
-
- // 6 more hours pass, no more nodes are failed
- for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) {
- tester.clock.advance(Duration.ofMinutes(interval));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
- }
- tester.runMaintainers();
- assertEquals(2, tester.nodeRepository.nodes().list(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
- assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
-
- // 18 more hours pass, the remaining dead nodes are allowed to fail
- for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) {
- tester.clock.advance(Duration.ofMinutes(interval));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
- }
- tester.runMaintainers();
- assertEquals(4, tester.nodeRepository.nodes().list(Node.State.failed).size());
-
- // 24 more hours pass, nothing happens
- for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
- tester.clock.advance(Duration.ofMinutes(interval));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
- }
// 3 hosts fail. 2 of them and all of their children are allowed to fail
List<Node> failedHosts = hosts.asList().subList(0, 3);
failedHosts.forEach(host -> {
tester.serviceMonitor.setHostDown(host.hostname());
- deadNodes.add(host);
});
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(61));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
tester.runMaintainers();
- assertEquals(4 + /* already failed */
- 2 + /* hosts */
+ assertEquals(2 + /* hosts */
(2 * 3) /* containers per host */,
tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
@@ -690,61 +611,58 @@ public class NodeFailerTest {
// 24 more hours pass without any other nodes being failed out
for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
tester.runMaintainers();
- assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(8, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled host failures", 1, tester.metric.values.get(NodeFailer.throttledHostFailuresMetric));
// The final host and its containers are failed out
tester.clock.advance(Duration.ofMinutes(30));
tester.runMaintainers();
- assertEquals(16, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// Nothing else to fail
tester.clock.advance(Duration.ofHours(25));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
tester.runMaintainers();
- assertEquals(16, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
}
// Throttles based on percentage in large zone
{
- NodeFailTester tester = NodeFailTester.withNoApplications();
- List<Node> readyNodes = tester.createReadyNodes(500);
- List<Node> deadNodes = readyNodes.subList(0, 15);
+ NodeFailTester tester = NodeFailTester.withTwoApplications(300, 100, 100);
+ NodeList allNodes = tester.nodeRepository.nodes().list();
+ assertEquals(500, allNodes.size());
// 2 hours pass, 15 nodes (3%) die
- for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) {
- tester.clock.advance(Duration.ofMinutes(interval));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
- }
tester.runMaintainers();
+ allNodes.state(Node.State.active)
+ .nodeType(NodeType.tenant)
+ .stream()
+ .limit(15)
+ .forEach(host -> tester.serviceMonitor.setHostDown(host.hostname()));
+ tester.runMaintainers();
+ tester.clock.advance(Duration.ofHours(2));
+ tester.runMaintainers();
+
// 2% are allowed to fail
assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 6 more hours pass, no more nodes are failed
- for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) {
- tester.clock.advance(Duration.ofMinutes(interval));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
- }
+ tester.clock.advance(Duration.ofHours(6));
tester.runMaintainers();
assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 18 more hours pass, 24 hours since the first 10 nodes were failed. The remaining 5 are failed
- for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) {
- tester.clock.advance(Duration.ofMinutes(interval));
- tester.allNodesMakeAConfigRequestExcept(deadNodes);
- }
+ tester.clock.advance(Duration.ofHours(18));
tester.runMaintainers();
assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java
index 5c452989f56..adf181cadde 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java
@@ -310,7 +310,7 @@ public class DynamicAllocationTest {
tester.activate(application, hosts);
NodeList activeNodes = tester.nodeRepository().nodes().list().owner(application);
- assertEquals(Set.of("127.0.127.13", "::13"), activeNodes.asList().get(0).ipConfig().primary());
+ assertEquals(Set.of("127.0.127.13", "::d"), activeNodes.asList().get(0).ipConfig().primary());
assertEquals(Set.of("127.0.127.2", "::2"), activeNodes.asList().get(1).ipConfig().primary());
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
index 21e9058cd7c..45cafc3e5cc 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
@@ -411,7 +411,7 @@ public class ProvisioningTester {
// One test involves two provision testers - to detect this we check if the
// name resolver already contains the next host - if this is the case - bump the indices and move on
- String testIp = String.format("127.0.0.%d", nextIP);
+ String testIp = String.format("127.0.%d.%d", nextIP / 256, nextIP % 256);
MockNameResolver nameResolver = (MockNameResolver)nodeRepository().nameResolver();
if (nameResolver.resolveHostname(testIp).isPresent()) {
nextHost += 100;
@@ -419,8 +419,8 @@ public class ProvisioningTester {
}
String hostname = nodeNamer.apply(nextHost);
- String ipv4 = String.format("127.0.0.%d", nextIP);
- String ipv6 = String.format("::%d", nextIP);
+ String ipv4 = String.format("127.0.%d.%d", nextIP / 256, nextIP % 256);
+ String ipv6 = String.format("::%x", nextIP);
nameResolver.addRecord(hostname, ipv4, ipv6);
HashSet<String> hostIps = new HashSet<>();
@@ -430,11 +430,11 @@ public class ProvisioningTester {
Set<String> ipAddressPool = new LinkedHashSet<>();
for (int poolIp = 1; poolIp <= ipAddressPoolSize; poolIp++) {
nextIP++;
- String ipv6Addr = String.format("::%d", nextIP);
+ String ipv6Addr = String.format("::%x", nextIP);
ipAddressPool.add(ipv6Addr);
nameResolver.addRecord(String.format("node-%d-of-%s", poolIp, hostname), ipv6Addr);
if (dualStack) {
- String ipv4Addr = String.format("127.0.127.%d", nextIP);
+ String ipv4Addr = String.format("127.0.%d.%d", (127 + (nextIP / 256)), nextIP % 256);
ipAddressPool.add(ipv4Addr);
nameResolver.addRecord(String.format("node-%d-of-%s", poolIp, hostname), ipv4Addr);
}