author    Martin Polden <mpolden@mpolden.no>  2021-07-15 13:55:10 +0200
committer Martin Polden <mpolden@mpolden.no>  2021-07-15 13:57:50 +0200
commit    6af8b75f47a7ef458d9917ee10b020b604490db7 (patch)
tree      4c379c963df5fec87d854152bc72975e74e766b5 /node-repository
parent    b7775c901add178a1773eb25a576333e2096a107 (diff)
Account for groups when deciding if node is replacement
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java | 13
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java   | 9
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java   | 26
3 files changed, 39 insertions(+), 9 deletions(-)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index 695f0dd8659..5fe10f09f8a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -76,7 +76,7 @@ public class NodePrioritizer {
// In dynamically provisioned zones, we can always take spare hosts since we can provision new on-demand,
// NodeCandidate::compareTo will ensure that they will not be used until there is no room elsewhere.
// In non-dynamically provisioned zones, we only allow allocating to spare hosts to replace failed nodes.
- this.canAllocateToSpareHosts = dynamicProvisioning || isReplacement(nodesInCluster);
+ this.canAllocateToSpareHosts = dynamicProvisioning || isReplacement(nodesInCluster, clusterSpec.group());
// Do not allocate new nodes for exclusive deployments in dynamically provisioned zones: provision new host instead.
this.canAllocateNew = requestedNodes instanceof NodeSpec.CountNodeSpec
&& (!dynamicProvisioning || !requestedNodes.isExclusive());
@@ -195,10 +195,13 @@ public class NodePrioritizer {
}
/** Returns whether we are allocating to replace a failed node */
- private boolean isReplacement(NodeList nodesInCluster) {
- int failedNodesInCluster = nodesInCluster.failing().size() + nodesInCluster.state(Node.State.failed).size();
- if (failedNodesInCluster == 0) return false;
- return ! requestedNodes.fulfilledBy(nodesInCluster.size() - failedNodesInCluster);
+ private boolean isReplacement(NodeList nodesInCluster, Optional<ClusterSpec.Group> group) {
+ NodeList nodesInGroup = group.map(ClusterSpec.Group::index)
+ .map(nodesInCluster::group)
+ .orElse(nodesInCluster);
+ int failedNodesInGroup = nodesInGroup.failing().size() + nodesInGroup.state(Node.State.failed).size();
+ if (failedNodesInGroup == 0) return false;
+ return ! requestedNodes.fulfilledBy(nodesInGroup.size() - failedNodesInGroup);
}
/**
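
For orientation, a minimal sketch of the group-scoped check introduced above. It uses simplified stand-in types (SketchNode with a plain group index and state) rather than Vespa's NodeList, ClusterSpec.Group and NodeSpec, and treats the requested count as a per-group target, so it illustrates the idea rather than the actual implementation:

// Illustrative sketch only, not Vespa's actual API: group-scoped replacement check.
import java.util.List;
import java.util.Optional;

class ReplacementSketch {

    enum State { active, failing, failed }

    record SketchNode(int group, State state) { }

    /** Returns whether we are allocating to replace a failed node, counting failures only in the requested group */
    static boolean isReplacement(List<SketchNode> nodesInCluster, Optional<Integer> group, int requestedPerGroup) {
        // Narrow to the requested group when one is given, otherwise consider the whole cluster
        List<SketchNode> nodesInGroup = group.map(index -> nodesInCluster.stream()
                                                                         .filter(node -> node.group() == index)
                                                                         .toList())
                                             .orElse(nodesInCluster);
        long failedNodesInGroup = nodesInGroup.stream().filter(node -> node.state() != State.active).count();
        if (failedNodesInGroup == 0) return false;
        // Replacement is needed if the healthy nodes left in the group cannot satisfy the request
        return nodesInGroup.size() - failedNodesInGroup < requestedPerGroup;
    }
}

The only behavioural difference from the removed version is the narrowing step: when the ClusterSpec carries no group, orElse falls back to the whole cluster, preserving the old cluster-wide counting.
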
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 9ded28094d2..774350084ac 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -148,11 +148,12 @@ public class NodeFailTester {
}
public static NodeFailTester withOneUndeployedApplication(Capacity capacity) {
- NodeFailTester tester = new NodeFailTester();
+ return withOneUndeployedApplication(capacity, ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build());
+ }
- // Create applications
- ClusterSpec clusterApp = ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build();
- Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, clusterApp, capacity));
+ public static NodeFailTester withOneUndeployedApplication(Capacity capacity, ClusterSpec spec) {
+ NodeFailTester tester = new NodeFailTester();
+ Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, spec, capacity));
tester.initializeMaintainers(apps);
return tester;
}
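
The new overload lets a test pin the exact ClusterSpec, for example a content cluster with multiple groups, while the single-argument version keeps its old behaviour by delegating with the default container spec. A usage sketch, mirroring the new test added below:

// Usage sketch (mirrors the new test below): a content cluster with 4 nodes in 2 groups.
NodeResources resources = new NodeResources(1, 20, 15, 1);
Capacity capacity = Capacity.from(new ClusterResources(4, 2, resources), false, true);
ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test"))
                              .vespaVersion("6.42")
                              .build();
NodeFailTester tester = NodeFailTester.withOneUndeployedApplication(capacity, spec);
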
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 7eff8af8b8d..dfeb82281e6 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -377,6 +377,32 @@ public class NodeFailerTest {
}
@Test
+ public void node_failing_can_allocate_spare_to_replace_failed_node_in_group() {
+ NodeResources resources = new NodeResources(1, 20, 15, 1);
+ Capacity capacity = Capacity.from(new ClusterResources(4, 2, resources), false, true);
+ ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test")).vespaVersion("6.42").build();
+ NodeFailTester tester = NodeFailTester.withOneUndeployedApplication(capacity, spec);
+ assertEquals("Test depends on this setting in NodeFailTester", 1, tester.nodeRepository.spareCount());
+ tester.createAndActivateHosts(5, resources); // One extra - becomes designated spare
+ tester.activate(NodeFailTester.app1, spec, capacity);
+
+ // Hardware failure is reported for host
+ NodeList activeNodes = tester.nodeRepository.nodes().list(Node.State.active);
+ Node downNode = activeNodes.owner(NodeFailTester.app1).first().get();
+ Node downHost = activeNodes.parentOf(downNode).get();
+ tester.tester.patchNode(downHost, (node) -> node.with(node.reports().withReport(badTotalMemorySizeReport)));
+ tester.suspend(downHost.hostname());
+ tester.suspend(downNode.hostname());
+
+ // Node is failed and replaced
+ tester.runMaintainers();
+ assertEquals(1, tester.deployer.redeployments);
+ NodeList failedOrActive = tester.nodeRepository.nodes().list(Node.State.active, Node.State.failed);
+ assertEquals(4, failedOrActive.state(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals(Set.of(downNode.hostname()), failedOrActive.state(Node.State.failed).nodeType(NodeType.tenant).hostnames());
+ }
+
+ @Test
public void failing_ready_nodes() {
NodeFailTester tester = NodeFailTester.withTwoApplications();